GSS Data

In [73]:
# Loading the data using R provided
library(foreign)
  read.dct <- function(dct, labels.included = "yes") {
      temp <- readLines(dct)
      temp <- temp[grepl("_column", temp)]
      switch(labels.included,
             yes = {
                 pattern <- "_column\\(([0-9]+)\\)\\s+([a-z0-9]+)\\s+(.*)\\s+%([0-9]+)[a-z]\\s+(.*)"
                 classes <- c("numeric", "character", "character", "numeric", "character")
                 N <- 5
                 NAMES <- c("StartPos", "Str", "ColName", "ColWidth", "ColLabel")
             },
             no = {
                 pattern <- "_column\\(([0-9]+)\\)\\s+([a-z0-9]+)\\s+(.*)\\s+%([0-9]+).*"
                 classes <- c("numeric", "character", "character", "numeric")
                 N <- 4
                 NAMES <- c("StartPos", "Str", "ColName", "ColWidth")
             })
      temp_metadata <- setNames(lapply(1:N, function(x) {
          out <- gsub(pattern, paste("\\", x, sep = ""), temp)
          out <- gsub("^\\s+|\\s+$", "", out)
          out <- gsub('\"', "", out, fixed = TRUE)
          class(out) <- classes[x] ; out }), NAMES)
      temp_metadata[["ColName"]] <- make.names(gsub("\\s", "", temp_metadata[["ColName"]]))
      temp_metadata
  }

  read.dat <- function(dat, metadata_var, labels.included = "yes") {
      read.fwf(dat, widths = metadata_var[["ColWidth"]], col.names = metadata_var[["ColName"]])
  }


GSS_metadata <- read.dct("GSS.dct")
GSS_ascii <- read.dat("GSS.dat", GSS_metadata)
attr(GSS_ascii, "col.label") <- GSS_metadata[["ColLabel"]]
GSS <- GSS_ascii
head(GSS)
A data.frame: 6 × 46
BALLOTRINCOMEINCOMEEARNRSUNRELATADULTSTEENSPRETEENBABIESHOMPOPFAMILY16MOBILE16REG16RES16RACESEXDEGREEEDUCAGEYEAR
<int><int><int><int><int><int><int><int><int><int><int><int><int><int><int><int><int><int><int><int>
10001-110001432512316231972
20000 020002823611010701972
30002 021104123612112481972
40002 020002130612317271972
50001 020002123312112611972
60001-110001113611114261972

Data cleansing and preparation

In [74]:
# lowercase the GSS column names to match mappings values provided
for( i in colnames(GSS)){
    colnames(GSS)[which(colnames(GSS)==i)] = tolower(i)
 }

head(GSS)
summary(GSS)
A data.frame: 6 × 46
ballotrincomeincomeearnrsunrelatadultsteenspreteenbabieshompopfamily16mobile16reg16res16racesexdegreeeducageyear
<int><int><int><int><int><int><int><int><int><int><int><int><int><int><int><int><int><int><int><int>
10001-110001432512316231972
20000 020002823611010701972
30002 021104123612112481972
40002 020002130612317271972
50001 020002123312112611972
60001-110001113611114261972
     ballot         rincome           income          earnrs     
 Min.   :0.000   Min.   : 0.000   Min.   : 0.00   Min.   :0.000  
 1st Qu.:0.000   1st Qu.: 0.000   1st Qu.: 9.00   1st Qu.:1.000  
 Median :1.000   Median : 8.000   Median :12.00   Median :1.000  
 Mean   :1.373   Mean   : 8.377   Mean   :13.91   Mean   :1.498  
 3rd Qu.:2.000   3rd Qu.:12.000   3rd Qu.:12.00   3rd Qu.:2.000  
 Max.   :4.000   Max.   :99.000   Max.   :99.00   Max.   :9.000  
    unrelat            adults          teens           preteen      
 Min.   :-1.0000   Min.   :1.000   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:-1.0000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:0.0000  
 Median : 0.0000   Median :2.000   Median :0.0000   Median :0.0000  
 Mean   :-0.1161   Mean   :1.926   Mean   :0.2562   Mean   :0.3313  
 3rd Qu.: 0.0000   3rd Qu.:2.000   3rd Qu.:0.0000   3rd Qu.:0.0000  
 Max.   : 9.0000   Max.   :9.000   Max.   :9.0000   Max.   :9.0000  
     babies           hompop           region         xnorcsiz     
 Min.   :0.0000   Min.   : 1.000   Min.   :1.000   Min.   : 1.000  
 1st Qu.:0.0000   1st Qu.: 2.000   1st Qu.:3.000   1st Qu.: 2.000  
 Median :0.0000   Median : 2.000   Median :5.000   Median : 3.000  
 Mean   :0.2765   Mean   : 2.653   Mean   :4.925   Mean   : 4.359  
 3rd Qu.:0.0000   3rd Qu.: 4.000   3rd Qu.:7.000   3rd Qu.: 6.000  
 Max.   :9.0000   Max.   :99.000   Max.   :9.000   Max.   :10.000  
    srcbelt         localnum        supervis           stress      
 Min.   :1.000   Min.   :0.000   Min.   :0.00000   Min.   :0.0000  
 1st Qu.:3.000   1st Qu.:0.000   1st Qu.:0.00000   1st Qu.:0.0000  
 Median :5.000   Median :0.000   Median :0.00000   Median :0.0000  
 Mean   :3.974   Mean   :1.119   Mean   :0.06491   Mean   :0.6447  
 3rd Qu.:5.000   3rd Qu.:2.000   3rd Qu.:0.00000   3rd Qu.:0.0000  
 Max.   :6.000   Max.   :9.000   Max.   :9.00000   Max.   :9.0000  
     satfin          satjob          health          happy      
 Min.   :0.000   Min.   :0.000   Min.   :0.000   Min.   :0.000  
 1st Qu.:1.000   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:1.000  
 Median :2.000   Median :1.000   Median :2.000   Median :2.000  
 Mean   :1.865   Mean   :1.451   Mean   :1.491   Mean   :1.733  
 3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:2.000  
 Max.   :9.000   Max.   :9.000   Max.   :9.000   Max.   :9.000  
    partyid           size             born           famdif16     
 Min.   :0.000   Min.   :   0.0   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:1.000   1st Qu.:   6.0   1st Qu.:1.0000   1st Qu.:0.0000  
 Median :3.000   Median :  25.0   Median :1.0000   Median :0.0000  
 Mean   :2.781   Mean   : 377.1   Mean   :0.9573   Mean   :0.6506  
 3rd Qu.:5.000   3rd Qu.: 114.0   3rd Qu.:1.0000   3rd Qu.:1.0000  
 Max.   :9.000   Max.   :8175.0   Max.   :9.0000   Max.   :9.0000  
    marital         indus10         occ10          wrkslf          evwork      
 Min.   :1.000   Min.   :   0   Min.   :   0   Min.   :0.000   Min.   :0.0000  
 1st Qu.:1.000   1st Qu.:3390   1st Qu.:2310   1st Qu.:2.000   1st Qu.:0.0000  
 Median :1.000   Median :6870   Median :4700   Median :2.000   Median :0.0000  
 Mean   :2.315   Mean   :5755   Mean   :4522   Mean   :1.819   Mean   :0.4117  
 3rd Qu.:3.000   3rd Qu.:8190   3rd Qu.:6230   3rd Qu.:2.000   3rd Qu.:1.0000  
 Max.   :9.000   Max.   :9999   Max.   :9999   Max.   :9.000   Max.   :9.0000  
      hrs2               hrs1          wrkstat           id_      
 Min.   :-1.00000   Min.   :-1.00   Min.   :1.000   Min.   :   1  
 1st Qu.:-1.00000   1st Qu.:-1.00   1st Qu.:1.000   1st Qu.: 507  
 Median :-1.00000   Median :28.00   Median :2.000   Median :1030  
 Mean   :-0.08421   Mean   :23.87   Mean   :3.053   Mean   :1152  
 3rd Qu.:-1.00000   3rd Qu.:40.00   3rd Qu.:5.000   3rd Qu.:1570  
 Max.   :99.00000   Max.   :99.00   Max.   :9.000   Max.   :4510  
    divorce         spwrksta         childs         family16     
 Min.   :0.000   Min.   :0.000   Min.   :0.000   Min.   :-1.000  
 1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.000   1st Qu.: 1.000  
 Median :1.000   Median :1.000   Median :2.000   Median : 1.000  
 Mean   :1.146   Mean   :1.536   Mean   :1.962   Mean   : 1.892  
 3rd Qu.:2.000   3rd Qu.:1.000   3rd Qu.:3.000   3rd Qu.: 2.000  
 Max.   :9.000   Max.   :9.000   Max.   :9.000   Max.   : 9.000  
    mobile16        reg16           res16            race            sex       
 Min.   :0.00   Min.   :0.000   Min.   :0.000   Min.   :1.000   Min.   :1.000  
 1st Qu.:1.00   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.000  
 Median :2.00   Median :4.000   Median :3.000   Median :1.000   Median :2.000  
 Mean   :1.95   Mean   :4.348   Mean   :3.405   Mean   :1.253   Mean   :1.559  
 3rd Qu.:3.00   3rd Qu.:6.000   3rd Qu.:5.000   3rd Qu.:1.000   3rd Qu.:2.000  
 Max.   :9.00   Max.   :9.000   Max.   :9.000   Max.   :3.000   Max.   :2.000  
     degree           educ           age             year     
 Min.   :0.000   Min.   : 0.0   Min.   :18.00   Min.   :1972  
 1st Qu.:1.000   1st Qu.:12.0   1st Qu.:32.00   1st Qu.:1984  
 Median :1.000   Median :12.0   Median :44.00   Median :1996  
 Mean   :1.379   Mean   :13.1   Mean   :46.29   Mean   :1995  
 3rd Qu.:2.000   3rd Qu.:15.0   3rd Qu.:59.00   3rd Qu.:2006  
 Max.   :9.000   Max.   :99.0   Max.   :99.00   Max.   :2018  
In [75]:
# first, map column values to human readable EXCEPT for continous features listed below in skip_cols
code_mappings = read.table(file='data/code_mappings_tab_delim.txt', sep='\t', header=TRUE)
head(code_mappings)

# loop through each column, and map the code to the text value
# skip listed columns because these have typed answers
library(dplyr)
GSS['jobsat_num'] = GSS['satjob']
skip_cols = data.frame("jobsat_num","year","age","educ","childs","id_","hrs1","hrs2","hompop","babies","teens","preteen","adults","unrelat","earnrs")
columns_gss = names(GSS)

for (i in columns_gss){
    print(i)
    if (i %in% unlist(skip_cols)) {
        print('skipping')
        next
    }
    map_df = data.frame(code_mappings$Code[which(code_mappings$Variable.Name==i)],code_mappings$Label[which(code_mappings$Variable.Name==i)])
    names(map_df)[1] <- i
    names(map_df)[2] <- "to"
    GSS <- merge(x=GSS,y=map_df,by=i,all.x=TRUE,all.y=TRUE)
    GSS[i] = GSS['to']
    GSS['to'] = NULL
    print("done")
}

# make sure it worked
print(head(GSS))
A data.frame: 6 × 4
Variable.PositionVariable.NameCodeLabel
<int><fct><fct><fct>
10ballot 4 Ballot d
20ballot 3 Ballot c
30ballot 2 Ballot b
40ballot 1 Ballot a
50ballot 0 Not applicable
61rincome99No answer
[1] "ballot"
[1] "done"
[1] "rincome"
[1] "done"
[1] "income"
[1] "done"
[1] "earnrs"
[1] "skipping"
[1] "unrelat"
[1] "skipping"
[1] "adults"
[1] "skipping"
[1] "teens"
[1] "skipping"
[1] "preteen"
[1] "skipping"
[1] "babies"
[1] "skipping"
[1] "hompop"
[1] "skipping"
[1] "region"
[1] "done"
[1] "xnorcsiz"
[1] "done"
[1] "srcbelt"
[1] "done"
[1] "localnum"
[1] "done"
[1] "supervis"
[1] "done"
[1] "stress"
[1] "done"
[1] "satfin"
[1] "done"
[1] "satjob"
[1] "done"
[1] "health"
[1] "done"
[1] "happy"
[1] "done"
[1] "partyid"
[1] "done"
[1] "size"
[1] "done"
[1] "born"
[1] "done"
[1] "famdif16"
[1] "done"
[1] "marital"
[1] "done"
[1] "indus10"
[1] "done"
[1] "occ10"
[1] "done"
[1] "wrkslf"
[1] "done"
[1] "evwork"
[1] "done"
[1] "hrs2"
[1] "skipping"
[1] "hrs1"
[1] "skipping"
[1] "wrkstat"
[1] "done"
[1] "id_"
[1] "skipping"
[1] "divorce"
[1] "done"
[1] "spwrksta"
[1] "done"
[1] "childs"
[1] "skipping"
[1] "family16"
[1] "done"
[1] "mobile16"
[1] "done"
[1] "reg16"
[1] "done"
[1] "res16"
[1] "done"
[1] "race"
[1] "done"
[1] "sex"
[1] "done"
[1] "degree"
[1] "done"
[1] "educ"
[1] "skipping"
[1] "age"
[1] "skipping"
[1] "year"
[1] "skipping"
[1] "jobsat_num"
[1] "skipping"
            degree    sex  race          res16           reg16        mobile16
1 1 Lt high school   Male White  Town lt 50000     New england       Same city
2 1 Lt high school Female White           Farm W  sou  central       Same city
3 1 Lt high school   Male White  Town lt 50000         Foreign Different state
4 1 Lt high school Female Black City gt 250000         Pacific       Same city
5 1 Lt high school Female White  Town lt 50000     New england       Same city
6 1 Lt high school   Male White City gt 250000        Mountain Different state
               family16         spwrksta        divorce          wrkstat
1    Mother  and father    Keeping house             No Working fulltime
2                 Other   Not applicable            Yes          Retired
3                Mother          Retired             No          Retired
4                Father   Not applicable Not applicable    Keeping house
5    Mother  and father   Not applicable             No Working fulltime
6 Mother  and stpfather Working fulltime            Yes Unempl  laid off
          evwork         wrkslf                                  occ10
1 Not applicable   Someone else         Janitors and building cleaners
2            Yes   Someone else        Maids and housekeeping cleaners
3            Yes   Someone else                             Carpenters
4             No Not applicable                         Not applicable
5 Not applicable   Someone else               Food preparation workers
6 Not applicable   Someone else Driver sales workers and truck drivers
                                               indus10       marital
1 Colleges and universities  including junior colleges       Married
2                                            Hospitals       Widowed
3                 Motion pictures and video industries       Married
4                                       Not applicable Never married
5         Residential care facilities  without nursing       Widowed
6                          Other consumer goods rental       Married
           famdif16 born size            partyid           happy         health
1    Not applicable  Yes <NA>    Strong democrat  2 Pretty happy           Good
2             Other  Yes <NA>   Not str democrat    3 Very happy Not applicable
3       Parent died   No <NA> Not str republican 1 Not too happy      Excellent
4 Divorce separated  Yes <NA>    Strong democrat  2 Pretty happy           Fair
5    Not applicable  Yes <NA>        Independent  2 Pretty happy           Good
6 Divorce separated  Yes <NA>   Not str democrat    3 Very happy Not applicable
            satjob           satfin      stress supervis       localnum
1 3 Mod  satisfied   2 More or less     No issp  No issp Not applicable
2   Not applicable      3 Satisfied     No issp  No issp Not applicable
3   Not applicable 1 Not at all sat     No issp  No issp Not applicable
4 4 Very satisfied 1 Not at all sat     No issp  No issp Not applicable
5 4 Very satisfied   2 More or less 3 Sometimes  No issp     b 10 to 49
6 4 Very satisfied 1 Not at all sat     No issp  No issp Not applicable
           srcbelt         xnorcsiz          region          income
1 Suburb  12 lrgst Suburb  lrg city     New england l 25000 or more
2      Other urban Suburb  lrg city W  sou  central  h 8000 to 9999
3 Suburb  12 lrgst   Uninc lrg city Middle atlantic  g 7000 to 7999
4    Smsas 13to100  City3 gt 250000         Pacific      a Lt  1000
5  Suburb  13to100   Uninc med city     New england l 25000 or more
6    Smsas 13to100  City3 gt 250000        Mountain  f 6000 to 6999
           rincome         ballot earnrs unrelat adults teens preteen babies
1 j 15000 to 19999 Not applicable      2       0      2     0       0      0
2        No answer       Ballot b      0      -1      1     0       0      0
3   e 5000 to 5999 Not applicable      0       0      2     0       0      0
4   Not applicable       Ballot c      1       0      1     1       0      0
5  l 25000 or more       Ballot a      1       0      1     1       0      0
6   b 1000 to 2999       Ballot b      2       0      2     0       0      0
  hompop hrs2 hrs1  id_ childs educ age year jobsat_num
1      2   -1   40  259      1   10  56 1984          2
2      1   -1   -1 1965      4    8  80 1994          0
3      2   -1   -1  802      3   10  79 1980          0
4      2   -1   -1  197      8   11  48 2000          1
5      2   -1   35 1394      4   10  71 2014          1
6      2   -1   -1 1358      1    8  36 1993          1
In [76]:
# # change column names using:
column_name_mappings = read.table(file='data/column_name_mapping_tab_delim.txt', sep='\t', header=TRUE)
head(column_name_mappings)

# loop through each column, and map the code to the text value
columns_gss = names(GSS)

for (i in columns_gss){
    new_column_name = toString(column_name_mappings$Label[which(column_name_mappings$Name==i)][1])
    names(GSS)[names(GSS) == i] <- new_column_name
    
}

print(head(GSS))
A data.frame: 6 × 4
NameLabelDescriptionCategory
<fct><fct><lgl><lgl>
1ballot Ballot used for interview NANA
2rincomeRespondents income NANA
3income Total family income NANA
4earnrs How many in family earned money NANA
5unrelatNumber in household not related NANA
6adults Household members 18 yrs and olderNANA
  Rs highest degree Respondents sex Race of respondent
1  1 Lt high school            Male              White
2  1 Lt high school          Female              White
3  1 Lt high school            Male              White
4  1 Lt high school          Female              Black
5  1 Lt high school          Female              White
6  1 Lt high school            Male              White
  Type of place lived in when 16 yrs old Region of residence, age 16
1                          Town lt 50000                 New england
2                                   Farm             W  sou  central
3                          Town lt 50000                     Foreign
4                         City gt 250000                     Pacific
5                          Town lt 50000                 New england
6                         City gt 250000                    Mountain
  Geographic mobility since age 16 Living with parents when 16 yrs old
1                        Same city                  Mother  and father
2                        Same city                               Other
3                  Different state                              Mother
4                        Same city                              Father
5                        Same city                  Mother  and father
6                  Different state               Mother  and stpfather
  Spouse labor force status Ever been divorced or separated Labor force status
1             Keeping house                              No   Working fulltime
2            Not applicable                             Yes            Retired
3                   Retired                              No            Retired
4            Not applicable                  Not applicable      Keeping house
5            Not applicable                              No   Working fulltime
6          Working fulltime                             Yes   Unempl  laid off
  Ever work as long as one year R self-emp or works for somebody
1                Not applicable                     Someone else
2                           Yes                     Someone else
3                           Yes                     Someone else
4                            No                   Not applicable
5                Not applicable                     Someone else
6                Not applicable                     Someone else
        Rs census occupation code (2010)
1         Janitors and building cleaners
2        Maids and housekeeping cleaners
3                             Carpenters
4                         Not applicable
5               Food preparation workers
6 Driver sales workers and truck drivers
                         Rs industry code (naics 2007) Marital status
1 Colleges and universities  including junior colleges        Married
2                                            Hospitals        Widowed
3                 Motion pictures and video industries        Married
4                                       Not applicable  Never married
5         Residential care facilities  without nursing        Widowed
6                          Other consumer goods rental        Married
  Reason not living with parents Was r born in this country
1                 Not applicable                        Yes
2                          Other                        Yes
3                    Parent died                         No
4              Divorce separated                        Yes
5                 Not applicable                        Yes
6              Divorce separated                        Yes
  Size of place in 1000s Political party affiliation General happiness
1                   <NA>             Strong democrat    2 Pretty happy
2                   <NA>            Not str democrat      3 Very happy
3                   <NA>          Not str republican   1 Not too happy
4                   <NA>             Strong democrat    2 Pretty happy
5                   <NA>                 Independent    2 Pretty happy
6                   <NA>            Not str democrat      3 Very happy
  Condition of health Job or housework Satisfaction with financial situation
1                Good 3 Mod  satisfied                        2 More or less
2      Not applicable   Not applicable                           3 Satisfied
3           Excellent   Not applicable                      1 Not at all sat
4                Fair 4 Very satisfied                      1 Not at all sat
5                Good 4 Very satisfied                        2 More or less
6      Not applicable 4 Very satisfied                      1 Not at all sat
  How often does r find work stressful
1                              No issp
2                              No issp
3                              No issp
4                              No issp
5                          3 Sometimes
6                              No issp
  Does r supervise others at work in main job Number of employees: rs work site
1                                     No issp                    Not applicable
2                                     No issp                    Not applicable
3                                     No issp                    Not applicable
4                                     No issp                    Not applicable
5                                     No issp                        b 10 to 49
6                                     No issp                    Not applicable
      Src beltcode Expanded norc. size code Region of interview
1 Suburb  12 lrgst         Suburb  lrg city         New england
2      Other urban         Suburb  lrg city     W  sou  central
3 Suburb  12 lrgst           Uninc lrg city     Middle atlantic
4    Smsas 13to100          City3 gt 250000             Pacific
5  Suburb  13to100           Uninc med city         New england
6    Smsas 13to100          City3 gt 250000            Mountain
  Total family income Respondents income Ballot used for interview
1     l 25000 or more   j 15000 to 19999            Not applicable
2      h 8000 to 9999          No answer                  Ballot b
3      g 7000 to 7999     e 5000 to 5999            Not applicable
4          a Lt  1000     Not applicable                  Ballot c
5     l 25000 or more    l 25000 or more                  Ballot a
6      f 6000 to 6999     b 1000 to 2999                  Ballot b
  How many in family earned money Number in household not related
1                               2                               0
2                               0                              -1
3                               0                               0
4                               1                               0
5                               1                               0
6                               2                               0
  Household members 18 yrs and older Household members 13 thru 17 yrs old
1                                  2                                    0
2                                  1                                    0
3                                  2                                    0
4                                  1                                    1
5                                  1                                    1
6                                  2                                    0
  Household members 6 thru 12 yrs old Household members less than 6 yrs old
1                                   0                                     0
2                                   0                                     0
3                                   0                                     0
4                                   0                                     0
5                                   0                                     0
6                                   0                                     0
  Number of persons in household Number of hours usually work a week
1                              2                                  -1
2                              1                                  -1
3                              2                                  -1
4                              2                                  -1
5                              2                                  -1
6                              2                                  -1
  Number of hours worked last week Respondent id number Number of children
1                               40                  259                  1
2                               -1                 1965                  4
3                               -1                  802                  3
4                               -1                  197                  8
5                               35                 1394                  4
6                               -1                 1358                  1
  Highest year of school completed Age of respondent
1                               10                56
2                                8                80
3                               10                79
4                               11                48
5                               10                71
6                                8                36
  Gss year for this respondent                        NA
1                                                1984  2
2                                                1994  0
3                                                1980  0
4                                                2000  1
5                                                2014  1
6                                                1993  1
In [77]:
# split out factors from continuous predictors:

# predictors - factors
predictors_factors = data.frame(
    "Was r born in this country",
    "Region of interview",
    "Does r supervise others at work in main job",
    "Political party affiliation",
    "Reason not living with parents",
    "Living with parents when 16 yrs old",
    "Geographic mobility since age 16",
    "Condition of health",
    "How often does r find work stressful",
    "Region of residence, age 16",
    "Type of place lived in when 16 yrs old",
    "Rs industry code (naics 2007)",
    "Rs census occupation code (2010)",
    "R self-emp or works for somebody",
    "Ever work as long as one year",
    "Labor force status",
    "Marital status",
    "Ever been divorced or separated",
    "Spouse labor force status",
    "Race of respondent",
    "Respondents sex",
    "Rs highest degree",
    "Respondents income",
    "Total family income",
    "Number of employees: rs work site",
    "Size of place in 1000s"
)

continuous_factors = data.frame(
    "Age of respondent",
    "How many in family earned money",
    "Number in household not related",
    "Household members 18 yrs and older",
    "Household members 13 thru 17 yrs old",
    "Household members 6 thru 12 yrs old",
    "Household members less than 6 yrs old",
    "Number of persons in household",
    "Number of hours usually work a week",
    "Number of hours worked last week",
    "Number of children",
    "Highest year of school completed"
)

# time series fields
time_fields = data.frame("Gss year for this respondent")

id_fields = data.frame("Respondent id number")

# responses
responses = data.frame(
    "Job or housework", # satisfaction with job
    "Satisfaction with financial situation", 
    "General happiness"
    )


# unknown:
# - Src beltcode
# - Expanded norc  size code
In [78]:
GSS['counter'] = 1
In [79]:
# turn into NANs: "Not applicable" and "No answer" for all columns
library(dplyr)
GSS = na_if(GSS, "Not applicable")
GSS = na_if(GSS, "No answer")
GSS = na_if(GSS, "Dont know")
GSS = na_if(GSS, "Refused")
GSS = na_if(GSS, "No issp")
GSS = na_if(GSS, "Cant choose")

# turn into NANs: 
## 0 age, negative values in any of the continuous_factors
## 9, 99 or 98 in any of the continuous variables - these values map to 'No answer' and 'Dont know'
## Important: this cannot be done for Respondent ID because that makes null IDS, which is not valid

col_names = names(GSS)

for (i in col_names) {
    colname = toString(i)
    if(colname == 'Respondent id number'){
        next
    }
    GSS[colname] = na_if(GSS[colname], "9")
    GSS[colname] = na_if(GSS[colname], "99")
    GSS[colname] = na_if(GSS[colname], "98")
    GSS[colname] = na_if(GSS[colname], "13")
}

GSS$"Age of respondent" = na_if(GSS$"Age of respondent",0)

library(expss) # for using the lt() function below, means "less than"
for (i in continuous_factors) {
    colname = toString(i)
    GSS[colname] = na_if(GSS[colname], lt(0))
}

# remove NA responses
GSS = GSS[!is.na(GSS$"Job or housework"), ]

# Look at that cleaned data!
head(GSS)
A data.frame: 6 × 48
Rs highest degreeRespondents sexRace of respondentType of place lived in when 16 yrs oldRegion of residence, age 16Geographic mobility since age 16Living with parents when 16 yrs oldSpouse labor force statusEver been divorced or separatedLabor force statusNumber of persons in householdNumber of hours usually work a weekNumber of hours worked last weekRespondent id numberNumber of childrenHighest year of school completedAge of respondentGss year for this respondent NAcounter
<fct><fct><fct><fct><fct><fct><fct><fct><fct><fct><int><int><int><int><int><int><int><int><int><dbl>
11 Lt high schoolMale WhiteTown lt 50000 New england Same city Mother and father Keeping house No Working fulltime2NA40 25911056198421
41 Lt high schoolFemaleBlackCity gt 250000Pacific Same city Father NA NA Keeping house 2NANA 19781148200011
51 Lt high schoolFemaleWhiteTown lt 50000 New england Same city Mother and father NA No Working fulltime2NA35139441071201411
61 Lt high schoolMale WhiteCity gt 250000Mountain Different stateMother and stpfatherWorking fulltimeYesUnempl laid off2NANA13581 836199311
91 Lt high schoolMale WhiteTown lt 50000 W sou centralSame city Father and stpmotherNA NA Keeping house 6NANA 84621022198421
101 Lt high schoolMale WhiteFarm E sou centralSame city M and f relatives NA NA Working fulltime1NA42134251153198741
In [80]:
library(ggplot2)

# make the graphs small
options(repr.plot.width=5, repr.plot.height=5)

# counts
ggplot(GSS, aes(x=GSS$'Job or housework',na.rm = TRUE)) +
  geom_bar( na.rm = TRUE)+ 
  theme(axis.text.x = element_text(angle = 90)) +
  ggtitle("Distribution of job satisfaction")
In [81]:
# convert factors predictors into factors
for (i in predictors_factors){
    colname = toString(i)
    GSS[[colname]] = factor(GSS[[colname]])
}


# take a look at them
library(ggplot2)
library(Hmisc)

options(repr.plot.width=4, repr.plot.height=4)

for (i in predictors_factors){
  print(i)
  plot = ggplot(GSS, aes(x=GSS[[toString(i)]])) +
  geom_bar()+ 
  xlab(i) + 
  scale_x_discrete(label=abbreviate)+
  theme(axis.text.x = element_text(angle = 90))
    
  print(plot)
}
[1] Was r born in this country
Levels: Was r born in this country
[1] Region of interview
Levels: Region of interview
[1] Does r supervise others at work in main job
Levels: Does r supervise others at work in main job
[1] Political party affiliation
Levels: Political party affiliation
[1] Reason not living with parents
Levels: Reason not living with parents
[1] Living with parents when 16 yrs old
Levels: Living with parents when 16 yrs old
[1] Geographic mobility since age 16
Levels: Geographic mobility since age 16
[1] Condition of health
Levels: Condition of health
[1] How often does r find work stressful
Levels: How often does r find work stressful
[1] Region of residence, age 16
Levels: Region of residence, age 16
[1] Type of place lived in when 16 yrs old
Levels: Type of place lived in when 16 yrs old
[1] Rs industry code (naics 2007)
Levels: Rs industry code (naics 2007)
[1] Rs census occupation code (2010)
Levels: Rs census occupation code (2010)
[1] R self-emp or works for somebody
Levels: R self-emp or works for somebody
[1] Ever work as long as one year
Levels: Ever work as long as one year
[1] Labor force status
Levels: Labor force status
[1] Marital status
Levels: Marital status
[1] Ever been divorced or separated
Levels: Ever been divorced or separated
[1] Spouse labor force status
Levels: Spouse labor force status
[1] Race of respondent
Levels: Race of respondent
[1] Respondents sex
Levels: Respondents sex
[1] Rs highest degree
Levels: Rs highest degree
[1] Respondents income
Levels: Respondents income
[1] Total family income
Levels: Total family income
[1] Number of employees: rs work site
Levels: Number of employees: rs work site
[1] Size of place in 1000s
Levels: Size of place in 1000s
In [82]:
# drop unused levels to clean up
GSS = droplevels(GSS)

# review predictors
summary(GSS$"Job or housework")
summary(GSS$"Satisfaction with financial situation")
summary(GSS$"General happiness")
1 Very dissatisfied
1939
2 A little dissat
4611
3 Mod satisfied
17723
4 Very satisfied
22417
1 Not at all sat
12640
2 More or less
20892
3 Satisfied
12635
NA's
523
1 Not too happy
5462
2 Pretty happy
26179
3 Very happy
14410
NA's
639
In [83]:
# Plot continuous predictors

for (i in continuous_factors) {
    print(i)
    colname = toString(i)
    print(summary(GSS[colname]))
    print(hist(GSS[colname]))
}
[1] Age of respondent
Levels: Age of respondent
 Age of respondent
 Min.   :18.00    
 1st Qu.:30.00    
 Median :40.00    
 Mean   :41.92    
 3rd Qu.:52.00    
 Max.   :89.00    
 NA's   :142      
[1] 1
[1] How many in family earned money
Levels: How many in family earned money
 How many in family earned money
 Min.   :0.000                  
 1st Qu.:1.000                  
 Median :2.000                  
 Mean   :1.632                  
 3rd Qu.:2.000                  
 Max.   :8.000                  
 NA's   :346                    
[1] 1
[1] Number in household not related
Levels: Number in household not related
 Number in household not related
 Min.   :0.000                  
 1st Qu.:0.000                  
 Median :0.000                  
 Mean   :0.141                  
 3rd Qu.:0.000                  
 Max.   :8.000                  
 NA's   :10924                  
[1] 1
[1] Household members 18 yrs and older
Levels: Household members 18 yrs and older
 Household members 18 yrs and older
 Min.   :1.00                      
 1st Qu.:1.00                      
 Median :2.00                      
 Mean   :1.96                      
 3rd Qu.:2.00                      
 Max.   :8.00                      
 NA's   :56                        
[1] 1
[1] Household members 13 thru 17 yrs old
Levels: Household members 13 thru 17 yrs old
 Household members 13 thru 17 yrs old
 Min.   :0.0000                      
 1st Qu.:0.0000                      
 Median :0.0000                      
 Mean   :0.2496                      
 3rd Qu.:0.0000                      
 Max.   :7.0000                      
 NA's   :251                         
[1] 1
[1] Household members 6 thru 12 yrs old
Levels: Household members 6 thru 12 yrs old
 Household members 6 thru 12 yrs old
 Min.   :0.0000                     
 1st Qu.:0.0000                     
 Median :0.0000                     
 Mean   :0.3306                     
 3rd Qu.:0.0000                     
 Max.   :8.0000                     
 NA's   :322                        
[1] 1
[1] Household members less than 6 yrs old
Levels: Household members less than 6 yrs old
 Household members less than 6 yrs old
 Min.   :0.0000                       
 1st Qu.:0.0000                       
 Median :0.0000                       
 Mean   :0.2687                       
 3rd Qu.:0.0000                       
 Max.   :6.0000                       
 NA's   :300                          
[1] 1
[1] Number of persons in household
Levels: Number of persons in household
 Number of persons in household
 Min.   : 1.000                
 1st Qu.: 2.000                
 Median : 2.000                
 Mean   : 2.802                
 3rd Qu.: 4.000                
 Max.   :16.000                
 NA's   :114                   
[1] 1
[1] Number of hours usually work a week
Levels: Number of hours usually work a week
 Number of hours usually work a week
 Min.   : 0.00                      
 1st Qu.:35.00                      
 Median :40.00                      
 Mean   :39.17                      
 3rd Qu.:45.00                      
 Max.   :89.00                      
 NA's   :45562                      
[1] 1
[1] Number of hours worked last week
Levels: Number of hours worked last week
 Number of hours worked last week
 Min.   : 0.00                   
 1st Qu.:37.00                   
 Median :40.00                   
 Mean   :41.34                   
 3rd Qu.:48.00                   
 Max.   :89.00                   
 NA's   :12110                   
[1] 1
[1] Number of children
Levels: Number of children
 Number of children
 Min.   :0.000     
 1st Qu.:0.000     
 Median :2.000     
 Mean   :1.868     
 3rd Qu.:3.000     
 Max.   :8.000     
 NA's   :114       
[1] 1
[1] Highest year of school completed
Levels: Highest year of school completed
 Highest year of school completed
 Min.   : 0.00                   
 1st Qu.:12.00                   
 Median :12.00                   
 Mean   :13.23                   
 3rd Qu.:16.00                   
 Max.   :20.00                   
 NA's   :5500                    
[1] 1
In [59]:
ggplot(GSS, aes(x=`Number of hours usually work a week`)) + geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Warning message:
“Removed 45562 rows containing non-finite values (stat_bin).”
In [60]:
# Contigency tables

# Age by job satisfaction
xtabs(counter ~ `Job or housework`+`Age of respondent`, data=GSS)
                     Age of respondent
Job or housework       18  19  20  21  22  23  24  25  26  27  28  29  30  31
  1 Very dissatisfied   6  44  53  45  54  51  65  59  64  48  67  51  65  52
  2 A little dissat    19  93 101 109 108 139 148 148 155 128 159 140 131 131
  3 Mod  satisfied     49 232 252 324 367 437 404 513 470 542 500 476 520 505
  4 Very satisfied     31 134 200 250 259 356 379 461 463 502 508 491 537 505
                     Age of respondent
Job or housework       32  33  34  35  36  37  38  39  40  41  42  43  44  45
  1 Very dissatisfied  55  57  50  52  44  44  46  34  44  42  52  49  43  38
  2 A little dissat   128 125  94 122 140 137 125 102 108 102  94  94  90  97
  3 Mod  satisfied    533 474 490 508 448 451 457 415 430 405 397 428 360 356
  4 Very satisfied    538 532 612 526 563 569 568 541 532 519 523 521 523 458
                     Age of respondent
Job or housework       46  47  48  49  50  51  52  53  54  55  56  57  58  59
  1 Very dissatisfied  32  30  36  35  45  36  32  35  36  35  20  18  18  22
  2 A little dissat    76  87  85  81  74  72  81  72  74  63  58  51  46  61
  3 Mod  satisfied    368 353 342 346 322 329 300 288 272 243 269 250 244 217
  4 Very satisfied    488 446 451 490 446 468 422 459 406 388 460 364 419 367
                     Age of respondent
Job or housework       60  61  62  63  64  65  66  67  68  69  70  71  72  73
  1 Very dissatisfied  17  12  12  12   9   8   7   7   6   4   3   3   3   3
  2 A little dissat    51  40  32  38  17  19  12  18  17  12  11  11  13   9
  3 Mod  satisfied    226 193 166 128  97 122  96  81  77  71  62  52  36  39
  4 Very satisfied    360 329 307 270 247 239 179 175 162 137 142 138 116 109
                     Age of respondent
Job or housework       74  75  76  77  78  79  80  81  82  83  84  85  86  87
  1 Very dissatisfied   3   1   0   4   2   3   2   0   0   1   0   1   1   1
  2 A little dissat     8   2   6   2   4   4   3   6   2   0   2   2   2   2
  3 Mod  satisfied     44  34  37  43  36  23  17  18   8  14  15  10   8  12
  4 Very satisfied     97  80  83  78  67  58  52  53  41  30  20  21  22  16
                     Age of respondent
Job or housework       88  89
  1 Very dissatisfied   0   2
  2 A little dissat     3   3
  3 Mod  satisfied      5  15
  4 Very satisfied     13  31

No rows removed (yet)

We changed the values of "Not applicable" and "No Answer" and "Dont know" to R's encoded NAs

Regression sandbox

In [61]:
#*** fit ordinal logistic regression on a few easy variables just to see if it's the right direction
# not final model, just a sandbox

library(MASS)

olrmod_job <- polr(`Job or housework` ~ 
                   `R self-emp or works for somebody` + 
                   `Respondents income` + 
                   `How often does r find work stressful` + 
                   `Number of hours usually work a week`, data = GSS, Hess = TRUE)
summary(olrmod_job)
Call:
polr(formula = `Job or housework` ~ `R self-emp or works for somebody` + 
    `Respondents income` + `How often does r find work stressful` + 
    `Number of hours usually work a week`, data = GSS, Hess = TRUE)

Coefficients:
                                                       Value Std. Error
`R self-emp or works for somebody`Someone else      -0.07167   0.359807
`Respondents income`b 1000 to 2999                  -4.38426   1.321731
`Respondents income`c 3000 to 3999                  -1.82402   1.472220
`Respondents income`d 4000 to 4999                  -2.80797   1.448168
`Respondents income`e 5000 to 5999                   5.71478   0.003276
`Respondents income`f 6000 to 6999                  -4.39982   1.537370
`Respondents income`g 7000 to 7999                  -2.22543   1.565071
`Respondents income`h 8000 to 9999                  -2.20930   1.749456
`Respondents income`i 10000 to 14999                -2.15287   1.195865
`Respondents income`j 15000 to 19999                -3.24131   1.236840
`Respondents income`k 20000 to 24999                -2.15458   1.220734
`Respondents income`l 25000 or more                 -2.39376   1.166752
`How often does r find work stressful`2 Hardly ever  0.19317   0.601780
`How often does r find work stressful`3 Sometimes   -0.75861   0.543516
`How often does r find work stressful`4 Often       -0.89656   0.561932
`How often does r find work stressful`5 Always      -2.17896   0.604837
`Number of hours usually work a week`                0.01067   0.010026
                                                      t value
`R self-emp or works for somebody`Someone else        -0.1992
`Respondents income`b 1000 to 2999                    -3.3171
`Respondents income`c 3000 to 3999                    -1.2390
`Respondents income`d 4000 to 4999                    -1.9390
`Respondents income`e 5000 to 5999                  1744.4375
`Respondents income`f 6000 to 6999                    -2.8619
`Respondents income`g 7000 to 7999                    -1.4219
`Respondents income`h 8000 to 9999                    -1.2629
`Respondents income`i 10000 to 14999                  -1.8003
`Respondents income`j 15000 to 19999                  -2.6206
`Respondents income`k 20000 to 24999                  -1.7650
`Respondents income`l 25000 or more                   -2.0516
`How often does r find work stressful`2 Hardly ever    0.3210
`How often does r find work stressful`3 Sometimes     -1.3957
`How often does r find work stressful`4 Often         -1.5955
`How often does r find work stressful`5 Always        -3.6026
`Number of hours usually work a week`                  1.0645

Intercepts:
                                      Value     Std. Error t value  
1 Very dissatisfied|2 A little dissat   -6.1002    1.3203    -4.6203
2 A little dissat|3 Mod  satisfied      -4.6552    1.2946    -3.5958
3 Mod  satisfied|4 Very satisfied       -2.7225    1.2763    -2.1330

Residual Deviance: 571.7883 
AIC: 611.7883 
(46414 observations deleted due to missingness)
In [62]:
#*** p values for ordinal logistic regression
summary_table <- coef(summary(olrmod_job))
pval <- pnorm(abs(summary_table[, "t value"]),lower.tail = FALSE)* 2
summary_table <- cbind(summary_table, "p value" = round(pval,3))
summary_table
A matrix: 20 × 4 of type dbl
ValueStd. Errort valuep value
`R self-emp or works for somebody`Someone else-0.071673760.359807167 -0.19920050.842
`Respondents income`b 1000 to 2999-4.384258501.321731348 -3.31705720.001
`Respondents income`c 3000 to 3999-1.824017021.472219851 -1.23895690.215
`Respondents income`d 4000 to 4999-2.807969391.448167979 -1.93898040.053
`Respondents income`e 5000 to 5999 5.714781080.0032760021744.43752100.000
`Respondents income`f 6000 to 6999-4.399822321.537370222 -2.86191460.004
`Respondents income`g 7000 to 7999-2.225427961.565070960 -1.42193420.155
`Respondents income`h 8000 to 9999-2.209301621.749455673 -1.26285090.207
`Respondents income`i 10000 to 14999-2.152865431.195864798 -1.80025820.072
`Respondents income`j 15000 to 19999-3.241314171.236839561 -2.62064240.009
`Respondents income`k 20000 to 24999-2.154579951.220734036 -1.76498720.078
`Respondents income`l 25000 or more-2.393760591.166752242 -2.05164430.040
`How often does r find work stressful`2 Hardly ever 0.193173630.601779685 0.32100390.748
`How often does r find work stressful`3 Sometimes-0.758610710.543516061 -1.39574660.163
`How often does r find work stressful`4 Often-0.896556530.561932370 -1.59548830.111
`How often does r find work stressful`5 Always-2.178957690.604836910 -3.60255410.000
`Number of hours usually work a week` 0.010672610.010026390 1.06445140.287
1 Very dissatisfied|2 A little dissat-6.100193141.320300434 -4.62030680.000
2 A little dissat|3 Mod satisfied-4.655192561.294605356 -3.59583910.000
3 Mod satisfied|4 Very satisfied-2.722460891.276348785 -2.13300700.033
In [63]:
# happiness response with job predictors
library(MASS)

olr_happiness_job <- polr(`General happiness` ~ 
                   `R self-emp or works for somebody` + 
                   `Respondents income` + 
                   `How often does r find work stressful` + 
                   `Number of hours usually work a week`, data = GSS, Hess = TRUE)
print(summary(olr_happiness_job))

#*** p values for ordinal logistic regression -- THIS WORKS
summary_table <- coef(summary(olr_happiness_job))
pval <- pnorm(abs(summary_table[, "t value"]),lower.tail = FALSE)* 2
summary_table <- cbind(summary_table, "p value" = round(pval,3))
print(summary_table)
Call:
polr(formula = `General happiness` ~ `R self-emp or works for somebody` + 
    `Respondents income` + `How often does r find work stressful` + 
    `Number of hours usually work a week`, data = GSS, Hess = TRUE)

Coefficients:
                                                        Value Std. Error
`R self-emp or works for somebody`Someone else      -0.514320    0.38389
`Respondents income`b 1000 to 2999                   0.013288    1.19139
`Respondents income`c 3000 to 3999                   1.374948    1.19564
`Respondents income`d 4000 to 4999                   0.651855    1.13768
`Respondents income`e 5000 to 5999                  -0.165071    1.99002
`Respondents income`f 6000 to 6999                   2.896453    1.42236
`Respondents income`g 7000 to 7999                   1.164399    1.31845
`Respondents income`h 8000 to 9999                  -1.771299    1.48845
`Respondents income`i 10000 to 14999                 0.587909    0.88631
`Respondents income`j 15000 to 19999                 0.400055    0.94461
`Respondents income`k 20000 to 24999                 1.180656    0.91359
`Respondents income`l 25000 or more                  0.860033    0.85147
`How often does r find work stressful`2 Hardly ever -0.632779    0.62336
`How often does r find work stressful`3 Sometimes   -0.792711    0.58911
`How often does r find work stressful`4 Often       -0.849943    0.60750
`How often does r find work stressful`5 Always      -1.693441    0.64700
`Number of hours usually work a week`                0.009086    0.01053
                                                     t value
`R self-emp or works for somebody`Someone else      -1.33977
`Respondents income`b 1000 to 2999                   0.01115
`Respondents income`c 3000 to 3999                   1.14996
`Respondents income`d 4000 to 4999                   0.57297
`Respondents income`e 5000 to 5999                  -0.08295
`Respondents income`f 6000 to 6999                   2.03637
`Respondents income`g 7000 to 7999                   0.88316
`Respondents income`h 8000 to 9999                  -1.19003
`Respondents income`i 10000 to 14999                 0.66332
`Respondents income`j 15000 to 19999                 0.42351
`Respondents income`k 20000 to 24999                 1.29233
`Respondents income`l 25000 or more                  1.01006
`How often does r find work stressful`2 Hardly ever -1.01511
`How often does r find work stressful`3 Sometimes   -1.34562
`How often does r find work stressful`4 Often       -1.39908
`How often does r find work stressful`5 Always      -2.61739
`Number of hours usually work a week`                0.86313

Intercepts:
                               Value   Std. Error t value
1 Not too happy|2 Pretty happy -1.9035  1.0164    -1.8729
2 Pretty happy|3 Very happy     0.8720  1.0114     0.8621

Residual Deviance: 493.2086 
AIC: 531.2086 
(46425 observations deleted due to missingness)
                                                          Value Std. Error
`R self-emp or works for somebody`Someone else      -0.51431960 0.38388778
`Respondents income`b 1000 to 2999                   0.01328817 1.19139367
`Respondents income`c 3000 to 3999                   1.37494805 1.19564459
`Respondents income`d 4000 to 4999                   0.65185549 1.13768052
`Respondents income`e 5000 to 5999                  -0.16507067 1.99002382
`Respondents income`f 6000 to 6999                   2.89645276 1.42236045
`Respondents income`g 7000 to 7999                   1.16439885 1.31844770
`Respondents income`h 8000 to 9999                  -1.77129912 1.48844895
`Respondents income`i 10000 to 14999                 0.58790896 0.88630911
`Respondents income`j 15000 to 19999                 0.40005489 0.94461105
`Respondents income`k 20000 to 24999                 1.18065628 0.91359018
`Respondents income`l 25000 or more                  0.86003295 0.85146651
`How often does r find work stressful`2 Hardly ever -0.63277946 0.62335831
`How often does r find work stressful`3 Sometimes   -0.79271058 0.58910587
`How often does r find work stressful`4 Often       -0.84994307 0.60749985
`How often does r find work stressful`5 Always      -1.69344100 0.64699669
`Number of hours usually work a week`                0.00908618 0.01052696
1 Not too happy|2 Pretty happy                      -1.90350120 1.01635387
2 Pretty happy|3 Very happy                          0.87197560 1.01140315
                                                        t value p value
`R self-emp or works for somebody`Someone else      -1.33976549   0.180
`Respondents income`b 1000 to 2999                   0.01115347   0.991
`Respondents income`c 3000 to 3999                   1.14996385   0.250
`Respondents income`d 4000 to 4999                   0.57296885   0.567
`Respondents income`e 5000 to 5999                  -0.08294909   0.934
`Respondents income`f 6000 to 6999                   2.03637043   0.042
`Respondents income`g 7000 to 7999                   0.88315892   0.377
`Respondents income`h 8000 to 9999                  -1.19003014   0.234
`Respondents income`i 10000 to 14999                 0.66332271   0.507
`Respondents income`j 15000 to 19999                 0.42351282   0.672
`Respondents income`k 20000 to 24999                 1.29232593   0.196
`Respondents income`l 25000 or more                  1.01006081   0.312
`How often does r find work stressful`2 Hardly ever -1.01511354   0.310
`How often does r find work stressful`3 Sometimes   -1.34561650   0.178
`How often does r find work stressful`4 Often       -1.39908360   0.162
`How often does r find work stressful`5 Always      -2.61738741   0.009
`Number of hours usually work a week`                0.86313421   0.388
1 Not too happy|2 Pretty happy                      -1.87287248   0.061
2 Pretty happy|3 Very happy                          0.86214445   0.389
In [64]:
# happiness response with age predictor, just out of curiousity
# happiness response with job predictors
library(MASS)

olr_happiness_age <- polr(`General happiness` ~ 
                   `Age of respondent`, data = GSS, Hess = TRUE)
print(summary(olr_happiness_age))

#*** p values for ordinal logistic regression -- THIS WORKS
summary_table <- coef(summary(olr_happiness_age))
pval <- pnorm(abs(summary_table[, "t value"]),lower.tail = FALSE)* 2
summary_table <- cbind(summary_table, "p value" = round(pval,3))
print(summary_table)
Call:
polr(formula = `General happiness` ~ `Age of respondent`, data = GSS, 
    Hess = TRUE)

Coefficients:
                       Value Std. Error t value
`Age of respondent` 0.005666  0.0006363   8.905

Intercepts:
                               Value    Std. Error t value 
1 Not too happy|2 Pretty happy  -1.7714   0.0299   -59.2942
2 Pretty happy|3 Very happy      1.0244   0.0286    35.7738

Residual Deviance: 85976.54 
AIC: 85982.54 
(775 observations deleted due to missingness)
                                     Value   Std. Error    t value p value
`Age of respondent`             0.00566608 0.0006362597   8.905295       0
1 Not too happy|2 Pretty happy -1.77139024 0.0298745990 -59.294193       0
2 Pretty happy|3 Very happy     1.02437736 0.0286348190  35.773838       0

Handling time component

In [18]:
# look at distribution of responses by year

library(tidyr)

# Plot response counts and proportions
plot_resp = function(resp) {
    
    # count number of each response each year
    counts = data.frame()
    options = levels(GSS[1,resp])
    
    for (i in 1972:2018) {
        
        oneyear = c(i)
        for (j in 1:length(options)) {
            oneyear = c(oneyear, 
                        sum(GSS[GSS[resp]==options[j] & GSS$`Gss year for this respondent`==i, "counter"], na.rm = T))
        }
        oneyear = c(oneyear, sum(GSS[is.na(GSS[resp]) & GSS$`Gss year for this respondent`==i, "counter"], na.rm=T))

        counts = rbind(counts, oneyear)      
    }
    
    options = c(options, "NA")
    colnames(counts) = c("Year", options)
    #print(head(counts))
    counts_long <- gather(counts, Response, Count, options[1]:options[length(options)], factor_key=TRUE)
    #print(head(counts_long))

    # calculate proportions of responses
    prop = c()
    for (i in 1:nrow(counts_long)) {
        prop = c(prop, 
                 counts_long$Count[i]/sum(counts_long[counts_long$Year==counts_long[i, "Year"], "Count"]))
    }
    prop[is.na(prop)] = 0
    counts_long$Proportion = prop
    print(head(counts_long))

    return(counts_long)
}

# JOB OR HOUSEWORK

jobsat_counts = plot_resp("Job or housework")

# make the graphs wider
options(repr.plot.width=16, repr.plot.height=6)

# plot absolute frequency
ggplot(jobsat_counts, aes(fill=Response, x=Year, y=Count)) +
  geom_bar(position="dodge", stat="identity", width=0.9) +
  ggtitle("Job/housework Satisfaction Response Count by Year") +
  theme(axis.text.x = element_text(angle = 90), legend.position="bottom", plot.title = element_text(size=20))

# plot relative frequency
ggplot(jobsat_counts, aes(fill=Response, x=Year, y=Proportion)) +
  geom_bar(stat="identity", width=0.9) +
  ggtitle("Job/housework Satisfaction Response Proportion by Year, Including NAs") +
  theme(axis.text.x = element_text(angle = 90), legend.position="bottom", plot.title = element_text(size=20))
  #scale_x_discrete(breaks=seq(1970, 2020, 5))

# plot relative frequency, no NAs

jobsat_noNA = jobsat_counts[jobsat_counts$Response!="NA",]
#tail(jobsat_noNA)
# calculate proportions of responses, no NAs
prop = c()
for (i in 1:nrow(jobsat_noNA)) {
    prop = c(prop, 
             jobsat_noNA$Count[i]/sum(jobsat_noNA[jobsat_noNA$Year==jobsat_noNA[i, "Year"], "Count"]))
}
prop[is.na(prop)] = 0
jobsat_noNA$Proportion = prop
print(head(jobsat_noNA))

ggplot(jobsat_noNA, aes(fill=Response, x=Year, y=Proportion)) +
  geom_bar(stat="identity", width=0.9) +
  ggtitle("Job/housework Satisfaction Response Proportion by Year, Excluding NAs") +
  theme(axis.text.x = element_text(angle = 90), legend.position="bottom", plot.title = element_text(size=20))

# proportions don't change much, data may be indep of year
Attaching package: ‘tidyr’


The following objects are masked from ‘package:expss’:

    contains, nest


  Year            Response Count Proportion
1 1972 1 Very dissatisfied    33 0.03495763
2 1973 1 Very dissatisfied    49 0.04294479
3 1974 1 Very dissatisfied    57 0.04660670
4 1975 1 Very dissatisfied    48 0.04120172
5 1976 1 Very dissatisfied    53 0.04472574
6 1977 1 Very dissatisfied    35 0.02773376
  Year            Response Count Proportion
1 1972 1 Very dissatisfied    33 0.03495763
2 1973 1 Very dissatisfied    49 0.04294479
3 1974 1 Very dissatisfied    57 0.04660670
4 1975 1 Very dissatisfied    48 0.04120172
5 1976 1 Very dissatisfied    53 0.04472574
6 1977 1 Very dissatisfied    35 0.02773376

Ordinal Regressions sandbox with time component

MVORD library: https://cran.r-project.org/web/packages/mvord/vignettes/vignette_mvord.pdf

The R package mvord implements composite likelihood estimation in the class of multivariate ordinal regression models with a multivariate probit and a multivariate logit link. A flexible modeling framework for multiple ordinal measurements on the same subject is set up, which takes into consideration the dependence among the multiple observations by employing different error structures. Heterogeneity in the error structure across the subjects can be accounted for by the package, which allows for covariate dependent error structures. In addition, different regression coefficients and threshold parameters for each response are supported. If a reduction of the parameter space is desired, constraints on the threshold as well as on the regression coefficients can be specified by the user. The proposed multivariate framework is illustrated by means of a credit risk application

In [84]:
library(dplyr)
names(GSS)<-make.names(names(GSS),unique = TRUE)
GSS = rename(GSS, jobsat_num = NA.)
GSS = rename(GSS, Gss.year.for.this.respondent = Gss.year.for.this.respondent.......................)
In [85]:
library(dplyr)

#select subset of data
subset_dat <- GSS$Gss.year.for.this.respondent %in% c("2016","2018") #"2010","2012","2014", takes too long to run
GSS_subset <- GSS[subset_dat,]


# Add columns her to use in mvord/mixor
GSS_job_sat = data.frame(subset(GSS_subset, select=c("Job.or.housework",
                                                     "jobsat_num",
                                                     "R.self.emp.or.works.for.somebody",
                                                     "Respondent.id.number",
                                                     "Gss.year.for.this.respondent",
                                                     "Respondents.income", 
                                                     "How.often.does.r.find.work.stressful",
                                                     "Number.of.hours.usually.work.a.week",
                                                     "Age.of.respondent"
                                                    )))
GSS_job_sat = GSS_job_sat[!is.na(GSS_job_sat$Job.or.housework), ]
GSS_job_sat = GSS_job_sat[!is.na(GSS_job_sat$R.self.emp.or.works.for.somebody), ]
GSS_job_sat = GSS_job_sat[!is.na(GSS_job_sat$Respondents.income), ]
GSS_job_sat = GSS_job_sat[!is.na(GSS_job_sat$How.often.does.r.find.work.stressful), ]
GSS_job_sat = GSS_job_sat[!is.na(GSS_job_sat$Number.of.hours.usually.work.a.week), ]
GSS_job_sat = GSS_job_sat[!is.na(GSS_job_sat$Age.of.respondent), ]
GSS_job_sat = na.omit(GSS_job_sat)
print(head(GSS_job_sat))
       Job.or.housework jobsat_num R.self.emp.or.works.for.somebody
8915   4 Very satisfied          1                   Selftoemployed
9889   3 Mod  satisfied          2                     Someone else
15418  4 Very satisfied          1                     Someone else
15908  4 Very satisfied          1                     Someone else
16385 2 A little dissat          3                     Someone else
18599  3 Mod  satisfied          2                     Someone else
      Respondent.id.number Gss.year.for.this.respondent Respondents.income
8915                   801                         2018   k 20000 to 24999
9889                  2151                         2018    l 25000 or more
15418                 2230                         2016    l 25000 or more
15908                   17                         2016   k 20000 to 24999
16385                 1703                         2018    l 25000 or more
18599                 1711                         2016    l 25000 or more
      How.often.does.r.find.work.stressful Number.of.hours.usually.work.a.week
8915                               1 Never                                  30
9889                           3 Sometimes                                  70
15418                          3 Sometimes                                  45
15908                          3 Sometimes                                  40
16385                             5 Always                                  12
18599                              4 Often                                  65
      Age.of.respondent
8915                 65
9889                 46
15418                40
15908                56
16385                57
18599                41
In [121]:
# using mvord library
# only using 2016 and 2018 data where none of the values listed below are NA
# mvord is not able to handle NAs

library(mvord)
mvord_1 <- mvord(data = GSS_job_sat,
                 formula = MMO(Job.or.housework,Respondent.id.number,Gss.year.for.this.respondent) ~ 
                     Age.of.respondent,
    error.structure = cor_ar1(~ 1), 
    link = mvlogit(), 
    PL.lag = 2)

cat("made model")
print(summary(mvord_1))
In [22]:
prob_job_sat = prop.table(xtabs(counter ~ Job.or.housework, data=GSS))
prob_job_sat
income_dist = aggregate(GSS$counter, by=list(Job.or.housework=GSS$Job.or.housework,Respondents.income=GSS$Respondents.income), FUN=sum)


prob_job_sat_income = prop.table(xtabs(x ~ Job.or.housework+Respondents.income, data=income_dist))
prob_job_sat_income
summary(prob_job_sat_income) # they are independent, job satisfaction and income
options(repr.plot.width=12, repr.plot.height=12)
dotchart(prob_job_sat_income)
# mosaicplot(prob_job_sat_income)
Job.or.housework
1 Very dissatisfied   2 A little dissat    3 Mod  satisfied    4 Very satisfied 
         0.04152924          0.09875776          0.37958878          0.48012422 
                     Respondents.income
Job.or.housework       a Lt  1000 b 1000 to 2999 c 3000 to 3999 d 4000 to 4999
  1 Very dissatisfied 0.002198994    0.002650842    0.001596530    0.001385667
  2 A little dissat   0.004247372    0.005422177    0.003765400    0.003434045
  3 Mod  satisfied    0.011778173    0.017863060    0.012772238    0.011808296
  4 Very satisfied    0.013736181    0.019549959    0.014308522    0.010723861
                     Respondents.income
Job.or.housework      e 5000 to 5999 f 6000 to 6999 g 7000 to 7999
  1 Very dissatisfied    0.001626653    0.001265175    0.001626653
  2 A little dissat      0.003283429    0.003283429    0.003042444
  3 Mod  satisfied       0.011145585    0.010272013    0.010844353
  4 Very satisfied       0.012862608    0.011838419    0.010482875
                     Respondents.income
Job.or.housework      h 8000 to 9999 i 10000 to 14999 j 15000 to 19999
  1 Very dissatisfied    0.002138747      0.005723409      0.004066633
  2 A little dissat      0.006145134      0.014609754      0.010663614
  3 Mod  satisfied       0.018706510      0.053408441      0.041600145
  4 Very satisfied       0.020272916      0.059222219      0.047413923
                     Respondents.income
Job.or.housework      k 20000 to 24999 l 25000 or more
  1 Very dissatisfied      0.003675031     0.009217700
  2 A little dissat        0.010241889     0.029309877
  3 Mod  satisfied         0.042172485     0.143356327
  4 Very satisfied         0.048046510     0.211193783
Call: xtabs(formula = x ~ Job.or.housework + Respondents.income, data = income_dist)
Number of cases in table: 1 
Number of factors: 2 
Test for independence of all factors:
	Chisq = 0.014742, df = 33, p-value = 1
	Chi-squared approximation may be incorrect
In [23]:
income_dist = aggregate(GSS$counter, by=list(Job.or.housework=GSS$Job.or.housework,Respondents.income=GSS$Respondents.income), FUN=sum)
print(head(income_dist))
glm_sat_income = glm(Job.or.housework ~ Respondents.income, data=income_dist, family=binomial)
summary(glm_sat_income)
     Job.or.housework Respondents.income   x
1 1 Very dissatisfied         a Lt  1000  73
2   2 A little dissat         a Lt  1000 141
3    3 Mod  satisfied         a Lt  1000 391
4    4 Very satisfied         a Lt  1000 456
5 1 Very dissatisfied     b 1000 to 2999  88
6   2 A little dissat     b 1000 to 2999 180
Call:
glm(formula = Job.or.housework ~ Respondents.income, family = binomial, 
    data = income_dist)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6651   0.1526   0.7585   0.7585   0.7585  

Coefficients:
                                     Estimate Std. Error z value Pr(>|z|)
(Intercept)                         1.099e+00  1.155e+00   0.951    0.341
Respondents.incomeb 1000 to 2999    5.087e-16  1.633e+00   0.000    1.000
Respondents.incomec 3000 to 3999    1.778e-15  1.633e+00   0.000    1.000
Respondents.incomed 4000 to 4999    1.336e-15  1.633e+00   0.000    1.000
Respondents.incomee 5000 to 5999    1.414e-15  1.633e+00   0.000    1.000
Respondents.incomef 6000 to 6999    6.301e-16  1.633e+00   0.000    1.000
Respondents.incomeg 7000 to 7999    1.142e-15  1.633e+00   0.000    1.000
Respondents.incomeh 8000 to 9999    6.306e-16  1.633e+00   0.000    1.000
Respondents.incomei 10000 to 14999  2.975e-15  1.633e+00   0.000    1.000
Respondents.incomej 15000 to 19999  1.728e-15  1.633e+00   0.000    1.000
Respondents.incomek 20000 to 24999  5.795e-16  1.633e+00   0.000    1.000
Respondents.incomel 25000 or more  -7.252e-16  1.633e+00   0.000    1.000

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 53.984  on 47  degrees of freedom
Residual deviance: 53.984  on 36  degrees of freedom
AIC: 77.984

Number of Fisher Scoring iterations: 4
In [26]:
# using mixor library
library(mixor)

# mixor_df = GSS_job_sat[order(GSS_job_sat$Respondent.id.number, GSS_job_sat$Gss.year.for.this.respondent),]
mixor_df = GSS[order(GSS$Respondent.id.number, GSS$Gss.year.for.this.respondent),]

mixor_1<-mixor(jobsat_num ~ factor(Gss.year.for.this.respondent) + 
 #                factor(R.self.emp.or.works.for.somebody),# +
                 factor(Respondents.income),# +
#                  factor(How.often.does.r.find.work.stressful),# +
#                  factor(Number.of.hours.usually.work.a.week), 
               data=mixor_df, id=Respondent.id.number)

summary(mixor_1)
# No idea why all of these are 0
Warning message in model.matrix.default(mt, mf, contrasts):
“non-list contrasts argument ignored”
Call:
mixor(formula = jobsat_num ~ factor(Gss.year.for.this.respondent) + 
    factor(Respondents.income), data = mixor_df, id = Respondent.id.number)

Deviance =         71673.59 
Log-likelihood =  -2482274 
RIDGEMAX =         0 
AIC =             -35882.8 
SBC =             -36024.83 

                                           Estimate Std. Error z value P(>|z|)
(Intercept)                                      NA         NA      NA      NA
factor(Gss.year.for.this.respondent)1973         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)1974         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)1975         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)1976         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)1977         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)1978         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)1980         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)1982         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)1983         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)1984         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)1985         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)1986         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)1987         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)1988         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)1989         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)1990         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)1991         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)1993         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)1994         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)1996         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)1998         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)2000         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)2002         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)2004         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)2006         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)2008         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)2010         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)2012         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)2014         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)2016         NA         NA      NA      NA
factor(Gss.year.for.this.respondent)2018         NA         NA      NA      NA
factor(Respondents.income)b 1000 to 2999         NA         NA      NA      NA
factor(Respondents.income)c 3000 to 3999         NA         NA      NA      NA
factor(Respondents.income)d 4000 to 4999         NA         NA      NA      NA
factor(Respondents.income)e 5000 to 5999         NA         NA      NA      NA
factor(Respondents.income)f 6000 to 6999         NA         NA      NA      NA
factor(Respondents.income)g 7000 to 7999         NA         NA      NA      NA
factor(Respondents.income)h 8000 to 9999         NA         NA      NA      NA
factor(Respondents.income)i 10000 to 14999       NA         NA      NA      NA
factor(Respondents.income)j 15000 to 19999       NA         NA      NA      NA
factor(Respondents.income)k 20000 to 24999       NA         NA      NA      NA
factor(Respondents.income)l 25000 or more        NA         NA      NA      NA
Random.(Intercept)                              0.1         NA      NA      NA
Threshold2                                       NA         NA      NA      NA
Threshold3                                       NA         NA      NA      NA

More visuals

In [28]:
# Great resource: https://stats.idre.ucla.edu/r/dae/ordinal-logistic-regression/
# Plot continuous predictors

continuous_factors_plot = data.frame(
    "Age.of.respondent",
    "How.many.in.family.earned.money",
    "Number.in.household.not.related",
    "Household.members.18.yrs.and.older",
    "Household.members.13.thru.17.yrs.old",
    "Household.members.6.thru.12.yrs.old",
    "Household.members.less.than.6.yrs.old",
    "Number.of.persons.in.household",
    "Total.family.income",
    "Respondents.income",
    "Number.of.employees..rs.work.site",
    "How.often.does.r.find.work.stressful",
    "Size.of.place.in.1000s",
    "Number.of.hours.usually.work.a.week",
    "Number.of.hours.worked.last.week",
    "Number.of.children",
    "Highest.year.of.school.completed",
    "Gss.year.for.this.respondent"
)

GSS = GSS[!is.na(GSS$Job.or.housework), ]
options(repr.plot.width=5, repr.plot.height=5)
for (i in continuous_factors_plot) {
    print(i)
    plot = ggplot(GSS, aes(x = GSS[[toString(i)]],y=Job.or.housework)) +
      geom_boxplot(size = .75) +
      geom_jitter(alpha = .5) +
      xlab(i)+
     # facet_grid(Respondents.income ~ How.often.does.r.find.work.stressful, margins = TRUE) +
      theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
    print(plot)
}
[1] Age.of.respondent
Levels: Age.of.respondent
Warning message:
“Removed 142 rows containing missing values (stat_boxplot).”
Warning message:
“Removed 142 rows containing missing values (geom_point).”
[1] How.many.in.family.earned.money
Levels: How.many.in.family.earned.money
Warning message:
“Removed 346 rows containing missing values (stat_boxplot).”
Warning message:
“Removed 346 rows containing missing values (geom_point).”
[1] Number.in.household.not.related
Levels: Number.in.household.not.related
Warning message:
“Removed 10924 rows containing missing values (stat_boxplot).”
Warning message:
“Removed 10924 rows containing missing values (geom_point).”
[1] Household.members.18.yrs.and.older
Levels: Household.members.18.yrs.and.older
Warning message:
“Removed 56 rows containing missing values (stat_boxplot).”
Warning message:
“Removed 56 rows containing missing values (geom_point).”
[1] Household.members.13.thru.17.yrs.old
Levels: Household.members.13.thru.17.yrs.old
Warning message:
“Removed 251 rows containing missing values (stat_boxplot).”
Warning message:
“Removed 251 rows containing missing values (geom_point).”
[1] Household.members.6.thru.12.yrs.old
Levels: Household.members.6.thru.12.yrs.old
Warning message:
“Removed 322 rows containing missing values (stat_boxplot).”
Warning message:
“Removed 322 rows containing missing values (geom_point).”
[1] Household.members.less.than.6.yrs.old
Levels: Household.members.less.than.6.yrs.old
Warning message:
“Removed 300 rows containing missing values (stat_boxplot).”
Warning message:
“Removed 300 rows containing missing values (geom_point).”
[1] Number.of.persons.in.household
Levels: Number.of.persons.in.household
Warning message:
“Removed 114 rows containing missing values (stat_boxplot).”
Warning message:
“Removed 114 rows containing missing values (geom_point).”
[1] Total.family.income
Levels: Total.family.income
[1] Respondents.income
Levels: Respondents.income
[1] Number.of.employees..rs.work.site
Levels: Number.of.employees..rs.work.site
[1] How.often.does.r.find.work.stressful
Levels: How.often.does.r.find.work.stressful
[1] Size.of.place.in.1000s
Levels: Size.of.place.in.1000s
[1] Number.of.hours.usually.work.a.week
Levels: Number.of.hours.usually.work.a.week
Warning message:
“Removed 45562 rows containing missing values (stat_boxplot).”
Warning message:
“Removed 45562 rows containing missing values (geom_point).”
[1] Number.of.hours.worked.last.week
Levels: Number.of.hours.worked.last.week
Warning message:
“Removed 12110 rows containing missing values (stat_boxplot).”
Warning message:
“Removed 12110 rows containing missing values (geom_point).”
[1] Number.of.children
Levels: Number.of.children
Warning message:
“Removed 114 rows containing missing values (stat_boxplot).”
Warning message:
“Removed 114 rows containing missing values (geom_point).”
[1] Highest.year.of.school.completed
Levels: Highest.year.of.school.completed
Warning message:
“Removed 5500 rows containing missing values (stat_boxplot).”
Warning message:
“Removed 5500 rows containing missing values (geom_point).”
[1] Gss.year.for.this.respondent
Levels: Gss.year.for.this.respondent
In [30]:
# get total response rate for all respondents for all of time:
response_dist = aggregate(GSS$counter, by=list(Respondent.id.number=GSS$Respondent.id.number), FUN=sum)
print(hist(response_dist$x))
response_rate = aggregate(GSS$counter/33, by=list(Respondent.id.number=GSS$Respondent.id.number), FUN=sum)
mean(response_rate$x)
response_rate

# is respondent 1 the same every year?
respondent_1 = GSS[ which(GSS$Respondent.id.number==1), ]
respondent_1
# NO, trickery.
$breaks
 [1]  0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30

$counts
 [1] 926 183 350 339 268 129  42  31  35 126 316 502 352 140  33

$density
 [1] 0.122746554 0.024257688 0.046394486 0.044936373 0.035524920 0.017099682
 [7] 0.005567338 0.004109226 0.004639449 0.016702015 0.041887593 0.066542948
[13] 0.046659597 0.018557794 0.004374337

$mids
 [1]  1  3  5  7  9 11 13 15 17 19 21 23 25 27 29

$xname
[1] "response_dist$x"

$equidist
[1] TRUE

attr(,"class")
[1] "histogram"
0.375092387287509
A data.frame: 3772 × 2
Respondent.id.numberx
<int><dbl>
10.7272727
20.8181818
30.7272727
40.8484848
50.6060606
60.6969697
70.8181818
80.6363636
90.8787879
100.6969697
110.7272727
120.6666667
130.6666667
140.7272727
150.7272727
160.8787879
170.6969697
180.7272727
190.7878788
200.6363636
210.7575758
220.7878788
230.6969697
240.6363636
250.5757576
260.6666667
270.6666667
280.7272727
290.7575758
300.7272727
44500.03030303
44520.03030303
44530.03030303
44550.03030303
44570.03030303
44610.03030303
44620.03030303
44690.03030303
44700.03030303
44710.03030303
44720.03030303
44730.03030303
44740.03030303
44750.03030303
44770.03030303
44810.03030303
44830.03030303
44850.03030303
44870.03030303
44890.03030303
44900.03030303
44910.03030303
44930.03030303
44940.03030303
45010.03030303
45020.03030303
45050.03030303
45060.03030303
45090.03030303
45100.03030303
A data.frame: 24 × 48
Rs.highest.degreeRespondents.sexRace.of.respondentType.of.place.lived.in.when.16.yrs.oldRegion.of.residence..age.16Geographic.mobility.since.age.16Living.with.parents.when.16.yrs.oldSpouse.labor.force.statusEver.been.divorced.or.separatedLabor.force.statusNumber.of.persons.in.householdNumber.of.hours.usually.work.a.weekNumber.of.hours.worked.last.weekRespondent.id.numberNumber.of.childrenHighest.year.of.school.completedAge.of.respondentGss.year.for.this.respondentjobsat_numcounter
<fct><fct><fct><fct><fct><fct><fct><fct><fct><fct><int><int><int><int><int><int><int><int><int><dbl>
10231 Lt high schoolMale WhiteTown lt 50000 Foreign Different state Mother Keeping house NoWorking fulltime 3NA2711 654197311
27701 Lt high schoolFemaleWhiteCountry nonfarm New england Same city Mother NA NAKeeping house 3NANA13NA24198341
34071 Lt high schoolMale WhiteTown lt 50000 New england Same city Mother and fatherKeeping house NoWorking fulltime 4NA4013NA41198241
41991 Lt high schoolMale WhiteCity gt 250000 New england Same city Mother and fatherNA NATemp not working 140NA10NA62198011
123081 Lt high schoolFemaleWhiteFarm New england Same st dif cityMother and fatherNA NAWorking fulltime 1NA4011 863197711
187192 High school Male WhiteTown lt 50000 W sou centralSame st dif cityMother and fatherNA NAWorking fulltime 1NA40121260199821
194412 High school Male WhiteCity gt 250000 Middle atlanticSame city Mother and fatherNA NAWorking fulltime 2NA40101428198631
237712 High school Male White50000 to 250000 Foreign Different state Female relative NA NAWorking fulltime 2NA35111143199321
310362 High school FemaleWhiteTown lt 50000 W sou centralSame city Mother and fatherNA NAWorking fulltime 1NA40101425200221
323412 High school FemaleWhite50000 to 250000 New england Same st dif cityMother and fatherRetired NoKeeping house NANANA181252197831
342682 High school FemaleBlackCity gt 250000 Middle atlanticSame city Mother and fatherNA NAKeeping house 6NANA121233199421
370902 High school FemaleWhite50000 to 250000 New england Same city Mother and fatherWorking fulltimeNoWorking parttime 2NA16111256197611
439952 High school FemaleBlackCity gt 250000 Foreign Different state Mother and fatherNA NAWorking fulltime 3NA3513NA50200611
467422 High school FemaleWhiteCity gt 250000 Foreign Different state Mother NA NAWorking fulltime 1NA40121261199111
482533 Junior collegeMale WhiteTown lt 50000 Middle atlanticDifferent state Mother and fatherNA NATemp not working 541NA101443201811
524114 Bachelor Male OtherCity gt 250000 Middle atlanticSame city Mother and fatherNA NAWorking fulltime 1NA55101631201021
525764 Bachelor Male WhiteBigtocity suburbMiddle atlanticDifferent state Mother NA NAWorking fulltime 1NA60101653201421
529944 Bachelor FemaleWhiteTown lt 50000 New england Different state Mother and fatherNA NAWorking fulltime 1NA40101665199031
540444 Bachelor Male WhiteCity gt 250000 W sou centralSame city Mother and fatherNA NAWorking fulltime 2NA40101626200011
558094 Bachelor Male WhiteTown lt 50000 New england Same st dif cityMother and fatherNA NAWorking parttime 3NA15101622201211
565424 Bachelor FemaleWhiteBigtocity suburbMiddle atlanticDifferent state Father NA NAWorking fulltime 1NANA101623197231
581354 Bachelor Male WhiteBigtocity suburbMiddle atlanticSame city Mother and fatherWorking parttimeNoWorking fulltime 2NA40101633198521
598624 Bachelor Male WhiteTown lt 50000 Middle atlanticDifferent state Mother and fatherKeeping house NoWorking fulltime 5NA50131647201621
603725 Graduate Male WhiteBigtocity suburbMiddle atlanticDifferent state Mother and fatherNA NAWorking fulltime 1NA40101632198741

Ordinal Regression Sandbox (no time component)

Hypothesis: factors directly related to job vs. others

In [87]:
# randomly pick 1 year (after 2008 crisis just in case)
set.seed(99)
yr = sample(seq(2010, 2018, by=2), 1) # only have data every other year
yr

subset_yr <- GSS$Gss.year.for.this.respondent == yr
GSS_yr <- GSS[subset_yr,]
2010
In [114]:
# split data into groups by category (otherwise models don't work)

# job-related variables
job_rel = data.frame(subset(GSS_yr, select=c(Job.or.housework,
#     Does.r.supervise.others.at.work.in.main.job,
    How.often.does.r.find.work.stressful,
    R.self.emp.or.works.for.somebody,
    Labor.force.status,
    Respondents.income,
    Number.of.employees..rs.work.site,
#     Rs industry code (naics 2007), # too many categories
#     Rs census occupation code (2010), # too many categories
#     Ever work as long as one year, # doesn't work, only 887 not NAs: length(job_rel$Ever.work.as.long.as.one.year[!is.na(job_rel$Ever.work.as.long.as.one.year)])
#     Number of hours usually work a week, # doesn't work, only 40 not NAs
    Number.of.hours.worked.last.week)))

# family-related variables
job_family = data.frame(subset(GSS_yr, select=c(Job.or.housework,
    Marital.status,
    Spouse.labor.force.status,
    Number.of.children,
    How.many.in.family.earned.money,
    Number.in.household.not.related,
    Household.members.18.yrs.and.older,
    Household.members.13.thru.17.yrs.old,
    Household.members.6.thru.12.yrs.old,
    Household.members.less.than.6.yrs.old,
    Number.of.persons.in.household,
    Total.family.income)))
#head(job_family)

# personal history variables
job_past = data.frame(subset(GSS_yr, select=c(Job.or.housework,
#    Reason.not.living.with.parents, failed algorithm did not converge
    Was.r.born.in.this.country,
    Living.with.parents.when.16.yrs.old,
    Geographic.mobility.since.age.16,
    Region.of.residence..age.16,
    Type.of.place.lived.in.when.16.yrs.old,
    Ever.been.divorced.or.separated))) # family?
#head(job_past)

# other variables
job_other = data.frame(subset(GSS_yr, select=c(Job.or.housework,
    Region.of.interview,
    Political.party.affiliation,
    Condition.of.health,
    Race.of.respondent,
    Respondents.sex,
    Rs.highest.degree,
    Age.of.respondent,
    Highest.year.of.school.completed)))
#head(job_other)

job_all = cbind(job_rel, job_family[,-1], job_past[,-1], job_other[,-1])
#head(job_all)

job_rel_family = cbind(job_rel, job_family[,-1])
job_rel_past = cbind(job_rel, job_past[,-1])
job_rel_other = cbind(job_rel, job_other[,-1])
In [105]:
print(head(job_all))
       Job.or.housework How.often.does.r.find.work.stressful
166    3 Mod  satisfied                                 <NA>
175    3 Mod  satisfied                              4 Often
246    3 Mod  satisfied                                 <NA>
279    3 Mod  satisfied                              4 Often
305 1 Very dissatisfied                          3 Sometimes
361    4 Very satisfied                          3 Sometimes
    R.self.emp.or.works.for.somebody Labor.force.status Respondents.income
166                     Someone else      Keeping house               <NA>
175                     Someone else   Working fulltime     f 6000 to 6999
246                     Someone else   Unempl  laid off               <NA>
279                     Someone else   Working fulltime    l 25000 or more
305                     Someone else   Working fulltime    l 25000 or more
361                     Someone else   Working fulltime     b 1000 to 2999
    Number.of.employees..rs.work.site Number.of.hours.worked.last.week
166                              <NA>                               NA
175                        b 10 to 49                               40
246                              <NA>                               NA
279                        b 10 to 49                               50
305                          a 1 to 9                               55
361                          a 1 to 9                               30
    Marital.status Spouse.labor.force.status Number.of.children
166       Divorced                      <NA>                  3
175        Married          Working fulltime                  4
246  Never married                      <NA>                  0
279       Divorced                      <NA>                  1
305        Married          Working fulltime                  1
361      Separated                      <NA>                  3
    How.many.in.family.earned.money Number.in.household.not.related
166                               0                              NA
175                               2                               0
246                               3                               0
279                               1                              NA
305                               2                              NA
361                               3                               1
    Household.members.18.yrs.and.older Household.members.13.thru.17.yrs.old
166                                  1                                    0
175                                  3                                    2
246                                  3                                    0
279                                  1                                    0
305                                  2                                    0
361                                  3                                    0
    Household.members.6.thru.12.yrs.old Household.members.less.than.6.yrs.old
166                                   0                                     0
175                                   0                                     1
246                                   0                                     0
279                                   0                                     0
305                                   0                                     0
361                                   0                                     0
    Number.of.persons.in.household Total.family.income
166                              1    i 10000 to 14999
175                              6    k 20000 to 24999
246                              3    j 15000 to 19999
279                              1     l 25000 or more
305                              2     l 25000 or more
361                              3    i 10000 to 14999
    Reason.not.living.with.parents Was.r.born.in.this.country
166                           <NA>                        Yes
175              Divorce separated                         No
246              Divorce separated                        Yes
279                           <NA>                        Yes
305                           <NA>                        Yes
361                    Parent died                        Yes
    Living.with.parents.when.16.yrs.old Geographic.mobility.since.age.16
166                  Mother  and father                        Same city
175               Father  and stpmother                  Different state
246                              Mother                        Same city
279                  Mother  and father                        Same city
305                  Mother  and father                 Same st dif city
361                               Other                 Same st dif city
    Region.of.residence..age.16 Type.of.place.lived.in.when.16.yrs.old
166             W  sou  central                                   Farm
175                     Foreign                       Bigtocity suburb
246                     Pacific                         City gt 250000
279              South atlantic                        Country nonfarm
305             E  sou  central                                   Farm
361              South atlantic                         City gt 250000
    Ever.been.divorced.or.separated Region.of.interview
166                            <NA>     W  sou  central
175                              No     E  nor  central
246                            <NA>             Pacific
279                            <NA>      South atlantic
305                              No     E  sou  central
361                            <NA>      South atlantic
    Political.party.affiliation Condition.of.health Race.of.respondent
166                Ind near dem                Fair              Black
175          Not str republican                Good              White
246                Ind near dem                Good              White
279                Ind near rep                Good              White
305                Ind near dem                <NA>              White
361                 Independent                Good              White
    Respondents.sex Rs.highest.degree Age.of.respondent
166          Female  1 Lt high school                62
175            Male  1 Lt high school                38
246            Male  1 Lt high school                49
279            Male  1 Lt high school                38
305            Male  1 Lt high school                49
361          Female  1 Lt high school                29
    Highest.year.of.school.completed
166                               11
175                                6
246                               11
279                                8
305                               NA
361                               NA
In [115]:
# full model - DOESN'T WORK
library(MASS)

olrmod_job_all <- polr(Job.or.housework ~ ., data = job_all, Hess = TRUE)
summary(olrmod_job_all)
Warning message:
“glm.fit: algorithm did not converge”
Warning message:
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
Error in polr(Job.or.housework ~ ., data = job_all, Hess = TRUE): attempt to find suitable starting values failed
Traceback:

1. polr(Job.or.housework ~ ., data = job_all, Hess = TRUE)
2. stop("attempt to find suitable starting values failed")

After include too many, get error "attempt to find suitable starting values failed"

Maybe because:

"There is one fairly common circumstance in which both convergence problems and the Hauck-Donner phenomenon can occur. This is when the fitted probabilities are extremely close to zero or one. Consider a medical diagnosis problem with thousands of cases and around 50 binary explanatory variable (which may arise from coding fewer categorical variables); one of these indicators is rarely true but always indicates that the disease is present. Then the fitted probabilities of cases with that indicator should be one, which can only be achieved by taking βi = ∞. The result from glm will be warnings and an estimated coefficient of around +/- 10. There has been fairly extensive discussion of this in the statistical literature, usually claiming non-existence of maximum likelihood estimates; see Sautner and Duffy (1989, p. 234)."

https://stackoverflow.com/questions/55416164/ordinal-logistic-regression-in-r https://stackoverflow.com/questions/8596160/why-am-i-getting-algorithm-did-not-converge-and-fitted-prob-numerically-0-or

In [107]:
# construct a model for each category

# partial model: job-related variables
olrmod_job_rel <- polr(Job.or.housework ~ ., data = job_rel, Hess = TRUE)

cat("Residual Deviance: ", olrmod_job_rel$deviance, "\n")
cat("Degrees of freedom: ", olrmod_job_rel$df.residual, "\n")
cat("p value: ", pchisq(olrmod_job_rel$deviance, olrmod_job_rel$df.residual, lower.tail = FALSE))

# summary with p values for ordinal logistic regression
summary_table <- coef(summary(olrmod_job_rel))
pval <- pnorm(abs(summary_table[, "t value"]),lower.tail = FALSE)* 2
summary_table <- cbind(summary_table, "p value" = round(pval,3))
summary_table
Warning message in polr(Job.or.housework ~ ., data = job_rel, Hess = TRUE):
“design appears to be rank-deficient, so dropping some coefs”
Residual Deviance:  1813.771 
Degrees of freedom:  903 
p value:  1.792271e-63
A matrix: 27 × 4 of type dbl
ValueStd. Errort valuep value
How.often.does.r.find.work.stressful2 Hardly ever-0.620618150.340273509-1.82388030.068
How.often.does.r.find.work.stressful3 Sometimes-1.112508760.324453367-3.42887110.001
How.often.does.r.find.work.stressful4 Often-1.753940600.337253167-5.20066460.000
How.often.does.r.find.work.stressful5 Always-1.772744570.377413310-4.69709080.000
R.self.emp.or.works.for.somebodySomeone else-0.507796670.235556727-2.15572980.031
Labor.force.statusWorking fulltime 0.300607460.228350023 1.31643280.188
Respondents.incomeb 1000 to 2999-0.535423180.667820581-0.80174700.423
Respondents.incomec 3000 to 3999-0.758747500.674492576-1.12491600.261
Respondents.incomed 4000 to 4999-0.931754330.784212636-1.18813990.235
Respondents.incomee 5000 to 5999-0.305931500.733292118-0.41720280.677
Respondents.incomef 6000 to 6999-1.414330730.711081842-1.98898450.047
Respondents.incomeg 7000 to 7999-0.598600680.753860334-0.79404720.427
Respondents.incomeh 8000 to 9999-0.698664520.673740546-1.03699340.300
Respondents.incomei 10000 to 14999-0.682897470.578451934-1.18056040.238
Respondents.incomej 15000 to 19999-0.221526970.598204968-0.37031950.711
Respondents.incomek 20000 to 24999-0.172107630.587325874-0.29303600.769
Respondents.incomel 25000 or more 0.130410140.557648628 0.23385720.815
Number.of.employees..rs.work.siteb 10 to 49-0.232317400.200948823-1.15610230.248
Number.of.employees..rs.work.sitec 50to99 0.114647990.234883953 0.48810480.625
Number.of.employees..rs.work.sited 100to499-0.412485330.213551845-1.93154650.053
Number.of.employees..rs.work.sitee 500to999-0.325850530.317042775-1.02778100.304
Number.of.employees..rs.work.sitef 1000to1999-0.635119200.378980778-1.67586120.094
Number.of.employees..rs.work.siteg 2000and up-0.042426800.281263244-0.15084370.880
Number.of.hours.worked.last.week 0.011973890.006171597 1.94016140.052
1 Very dissatisfied|2 A little dissat-4.933108900.672667249-7.33365410.000
2 A little dissat|3 Mod satisfied-3.306252130.648275399-5.10007340.000
3 Mod satisfied|4 Very satisfied-1.090585750.640007646-1.70401990.088
In [108]:
# partial model: family variables
olrmod_job_family <- polr(Job.or.housework ~ ., data = job_family, Hess = TRUE)
# summary incl p vals
summary_table <- coef(summary(olrmod_job_family))
pval <- pnorm(abs(summary_table[, "t value"]),lower.tail = FALSE)* 2
summary_table <- cbind(summary_table, "p value" = round(pval,3))
summary_table

#Warning "glm.fit: fitted probabilities numerically 0 or 1 occurred"
#https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression
Warning message:
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
Warning message in polr(Job.or.housework ~ ., data = job_family, Hess = TRUE):
“design appears to be rank-deficient, so dropping some coefs”
A matrix: 28 × 4 of type dbl
ValueStd. Errort valuep value
Spouse.labor.force.statusOther-0.682035865.486618e-01-1.243090e+000.214
Spouse.labor.force.statusRetired 0.323608094.542422e-01 7.124132e-010.476
Spouse.labor.force.statusSchool-0.960179465.273610e-01-1.820725e+000.069
Spouse.labor.force.statusTemp not working-0.486120395.752767e-01-8.450201e-010.398
Spouse.labor.force.statusUnempl laid off-0.033342305.358680e-01-6.222110e-020.950
Spouse.labor.force.statusWorking fulltime-0.244437152.865956e-01-8.528992e-010.394
Spouse.labor.force.statusWorking parttime-0.339665523.627793e-01-9.362870e-010.349
Number.of.children 0.155943476.559479e-02 2.377376e+000.017
How.many.in.family.earned.money 0.085000091.215808e-01 6.991244e-010.484
Number.in.household.not.related-0.684736624.411301e-01-1.552233e+000.121
Household.members.18.yrs.and.older 1.513280801.022382e+00 1.480152e+000.139
Household.members.13.thru.17.yrs.old 1.498164801.026508e+00 1.459477e+000.144
Household.members.6.thru.12.yrs.old 1.642188471.030438e+00 1.593679e+000.111
Household.members.less.than.6.yrs.old 1.752660581.029827e+00 1.701898e+000.089
Number.of.persons.in.household-1.754360891.023560e+00-1.713979e+000.087
Total.family.incomeb 1000 to 2999 0.927015371.534787e+00 6.040027e-010.546
Total.family.incomec 3000 to 399917.801495246.256478e-07 2.845290e+070.000
Total.family.incomed 4000 to 4999-0.864604841.883699e+00-4.589930e-010.646
Total.family.incomee 5000 to 5999 2.183204741.979604e+00 1.102849e+000.270
Total.family.incomeg 7000 to 7999-0.084357581.507653e+00-5.595293e-020.955
Total.family.incomeh 8000 to 9999-1.800072831.306926e+00-1.377333e+000.168
Total.family.incomei 10000 to 14999 0.018544631.152766e+00 1.608706e-020.987
Total.family.incomej 15000 to 19999 0.413778561.138382e+00 3.634795e-010.716
Total.family.incomek 20000 to 24999-0.519297481.060797e+00-4.895352e-010.624
Total.family.incomel 25000 or more 0.477892709.904941e-01 4.824791e-010.629
1 Very dissatisfied|2 A little dissat-3.469342551.094210e+00-3.170637e+000.002
2 A little dissat|3 Mod satisfied-2.150345921.078201e+00-1.994383e+000.046
3 Mod satisfied|4 Very satisfied-0.210255681.073944e+00-1.957790e-010.845
In [109]:
# partial model: personal history variables
olrmod_job_past <- polr(Job.or.housework ~ ., data = job_past, Hess = TRUE)

cat("Residual Deviance: ", olrmod_job_past$deviance, "\n")
cat("Degrees of freedom: ", olrmod_job_past$df.residual, "\n") 
cat("p value: ", pchisq(olrmod_job_past$deviance, olrmod_job_past$df.residual, lower.tail = FALSE))

# summary incl p vals
summary_table <- coef(summary(olrmod_job_past))
pval <- pnorm(abs(summary_table[, "t value"]),lower.tail = FALSE)* 2
summary_table <- cbind(summary_table, "p value" = round(pval,3))
summary_table
Warning message in polr(Job.or.housework ~ ., data = job_past, Hess = TRUE):
“design appears to be rank-deficient, so dropping some coefs”
Residual Deviance:  428.265 
Degrees of freedom:  198 
p value:  4.996144e-19
A matrix: 31 × 4 of type dbl
ValueStd. Errort valuep value
Reason.not.living.with.parentsDivorce separated-15.35208140.3261278-47.07382700.000
Reason.not.living.with.parentsOther-15.01608490.4810887-31.21271660.000
Reason.not.living.with.parentsParent died-15.22104180.3536365-43.04149260.000
Was.r.born.in.this.countryYes -0.28744030.6503681 -0.44196560.659
Living.with.parents.when.16.yrs.oldFather and stpmother 1.55717521.0068085 1.54664480.122
Living.with.parents.when.16.yrs.oldFemale relative -0.78354440.8437868 -0.92860480.353
Living.with.parents.when.16.yrs.oldM and f relatives -0.10710540.8767023 -0.12216850.903
Living.with.parents.when.16.yrs.oldMale relative 0.31840561.6353977 0.19469610.846
Living.with.parents.when.16.yrs.oldMother 0.53092220.5704126 0.93076860.352
Living.with.parents.when.16.yrs.oldMother and stpfather 0.72106610.6130789 1.17613910.240
Living.with.parents.when.16.yrs.oldOther -0.49024000.8361796 -0.58628560.558
Geographic.mobility.since.age.16Same city 0.26051420.3860670 0.67479010.500
Geographic.mobility.since.age.16Same st dif city -0.21857480.3839373 -0.56929820.569
Region.of.residence..age.16E sou central 0.68333090.6588601 1.03714120.300
Region.of.residence..age.16Foreign -0.13665970.7846935 -0.17415680.862
Region.of.residence..age.16Middle atlantic 0.71339960.5489166 1.29965020.194
Region.of.residence..age.16Mountain 1.32035460.7228657 1.82655600.068
Region.of.residence..age.16New england 1.88515211.2082399 1.56024650.119
Region.of.residence..age.16Pacific 0.69247880.5009084 1.38244590.167
Region.of.residence..age.16South atlantic 0.34650880.5001644 0.69278980.488
Region.of.residence..age.16W nor central 0.93506180.7329678 1.27572010.202
Region.of.residence..age.16W sou central 0.56691230.5618543 1.00900240.313
Type.of.place.lived.in.when.16.yrs.oldBigtocity suburb -0.58020740.5453603 -1.06389730.287
Type.of.place.lived.in.when.16.yrs.oldCity gt 250000 -0.35655280.4543500 -0.78475370.433
Type.of.place.lived.in.when.16.yrs.oldCountry nonfarm -0.37256370.5195828 -0.71704390.473
Type.of.place.lived.in.when.16.yrs.oldFarm 0.62032280.7160488 0.86631350.386
Type.of.place.lived.in.when.16.yrs.oldTown lt 50000 0.27765540.4178581 0.66447300.506
Ever.been.divorced.or.separatedYes -0.14392330.3450649 -0.41709050.677
1 Very dissatisfied|2 A little dissat-17.97493010.7827760-22.96305680.000
2 A little dissat|3 Mod satisfied-17.03394050.7671632-22.20380220.000
3 Mod satisfied|4 Very satisfied-14.91066250.7663081-19.45779150.000
In [110]:
# partial model: other variables
olrmod_job_other <- polr(Job.or.housework ~ ., data = job_other, Hess = TRUE)
# summary incl p vals
summary_table <- coef(summary(olrmod_job_other))
pval <- pnorm(abs(summary_table[, "t value"]),lower.tail = FALSE)* 2
summary_table <- cbind(summary_table, "p value" = round(pval,3))
summary_table
A matrix: 30 × 4 of type dbl
ValueStd. Errort valuep value
Region.of.interviewE sou central 0.164282190.309664200 0.53051720.596
Region.of.interviewMiddle atlantic 0.161700770.249576218 0.64790140.517
Region.of.interviewMountain 0.462812520.305703041 1.51392840.130
Region.of.interviewNew england 0.164827600.442148355 0.37278800.709
Region.of.interviewPacific 0.440329600.241935942 1.82002560.069
Region.of.interviewSouth atlantic 0.235595220.217001008 1.08568720.278
Region.of.interviewW nor central 0.417180390.319868773 1.30422360.192
Region.of.interviewW sou central 0.289373040.271083770 1.06746720.286
Political.party.affiliationInd near rep-0.271463820.281708274-0.96363450.335
Political.party.affiliationIndependent 0.164382270.243748858 0.67439200.500
Political.party.affiliationNot str democrat-0.459504700.243384821-1.88797600.059
Political.party.affiliationNot str republican 0.396379160.247636194 1.60065120.109
Political.party.affiliationOther party-0.489795930.434975730-1.12603050.260
Political.party.affiliationStrong democrat 0.114198710.250649422 0.45561130.649
Political.party.affiliationStrong republican 0.468025720.308610202 1.51655950.129
Condition.of.healthFair-0.854378860.208271423-4.10223760.000
Condition.of.healthGood-0.372949680.165376274-2.25515830.024
Condition.of.healthPoor-0.491091480.434311113-1.13073660.258
Race.of.respondentOther 0.085462910.298485649 0.28632170.775
Race.of.respondentWhite 0.335988860.210994154 1.59240840.111
Respondents.sexMale-0.067447110.139789989-0.48248880.629
Rs.highest.degree2 High school-0.136394690.282093945-0.48350800.629
Rs.highest.degree3 Junior college-0.303616360.407673850-0.74475310.456
Rs.highest.degree4 Bachelor-0.304484030.427623345-0.71203790.476
Rs.highest.degree5 Graduate-0.217885810.532134894-0.40945600.682
Age.of.respondent 0.018663580.005029737 3.71064750.000
Highest.year.of.school.completed 0.057212600.048066133 1.19028930.234
1 Very dissatisfied|2 A little dissat-1.829165830.652928481-2.80147960.005
2 A little dissat|3 Mod satisfied-0.500635170.638047373-0.78463640.433
3 Mod satisfied|4 Very satisfied 1.649408520.639238718 2.58027000.010

Combine job-related variables with each other category separately to compare

In [112]:
# full(ish) model: job-related vs. family variables
olrmod_job_rel_family <- polr(Job.or.housework ~ ., data = job_rel_family, Hess = TRUE)#, maxit = 100)

cat("Residual Deviance: ", olrmod_job_rel_family$deviance, "\n")
cat("Degrees of freedom: ", olrmod_job_rel_family$df.residual, "\n") 
cat("p value: ", pchisq(olrmod_job_rel_family$deviance, olrmod_job_rel_family$df.residual, lower.tail = FALSE))
# 0/1 problem?

# summary incl p vals
summary_table <- coef(summary(olrmod_job_rel_family))
pval <- pnorm(abs(summary_table[, "t value"]),lower.tail = FALSE)* 2
summary_table <- cbind(summary_table, "p value" = round(pval,3))
summary_table
Warning message:
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
Warning message in polr(Job.or.housework ~ ., data = job_rel_family, Hess = TRUE):
“design appears to be rank-deficient, so dropping some coefs”
Residual Deviance:  680.5526 
Degrees of freedom:  344 
p value:  2.344394e-24
A matrix: 50 × 4 of type dbl
ValueStd. Errort valuep value
How.often.does.r.find.work.stressful2 Hardly ever-3.075596431.226873e+00-2.506858e+000.012
How.often.does.r.find.work.stressful3 Sometimes-3.400929701.209988e+00-2.810714e+000.005
How.often.does.r.find.work.stressful4 Often-4.046721121.227360e+00-3.297093e+000.001
How.often.does.r.find.work.stressful5 Always-4.468100971.252233e+00-3.568105e+000.000
R.self.emp.or.works.for.somebodySomeone else-0.523584244.282916e-01-1.222495e+000.222
Labor.force.statusWorking fulltime 0.377619644.316722e-01 8.747833e-010.382
Respondents.incomeb 1000 to 2999-1.207866411.610951e+00-7.497845e-010.453
Respondents.incomec 3000 to 3999 0.152815021.385966e+00 1.102589e-010.912
Respondents.incomed 4000 to 4999-0.468009501.530287e+00-3.058312e-010.760
Respondents.incomee 5000 to 5999 0.025327101.428347e+00 1.773175e-020.986
Respondents.incomef 6000 to 6999-0.575113081.366169e+00-4.209679e-010.674
Respondents.incomeg 7000 to 7999-0.804398461.834103e+00-4.385786e-010.661
Respondents.incomeh 8000 to 9999 0.516503371.361887e+00 3.792557e-010.704
Respondents.incomei 10000 to 14999-0.337607081.165754e+00-2.896041e-010.772
Respondents.incomej 15000 to 19999 1.049926071.282700e+00 8.185279e-010.413
Respondents.incomek 20000 to 24999-0.433859141.200226e+00-3.614812e-010.718
Respondents.incomel 25000 or more 0.086796461.159776e+00 7.483899e-020.940
Number.of.employees..rs.work.siteb 10 to 49-0.027112973.649666e-01-7.428890e-020.941
Number.of.employees..rs.work.sitec 50to99-0.319085474.101613e-01-7.779512e-010.437
Number.of.employees..rs.work.sited 100to499-0.965362403.708417e-01-2.603166e+000.009
Number.of.employees..rs.work.sitee 500to999-0.825273835.334588e-01-1.547024e+000.122
Number.of.employees..rs.work.sitef 1000to1999-0.013326676.782276e-01-1.964926e-020.984
Number.of.employees..rs.work.siteg 2000and up 0.074287094.798067e-01 1.548271e-010.877
Number.of.hours.worked.last.week 0.022637571.098917e-02 2.059989e+000.039
Spouse.labor.force.statusOther 0.088054477.720853e-01 1.140476e-010.909
Spouse.labor.force.statusRetired 0.590759726.676088e-01 8.848891e-010.376
Spouse.labor.force.statusSchool-0.774706836.459509e-01-1.199328e+000.230
Spouse.labor.force.statusTemp not working 0.014588728.497557e-01 1.716814e-020.986
Spouse.labor.force.statusUnempl laid off 1.144538838.262130e-01 1.385283e+000.166
Spouse.labor.force.statusWorking fulltime-0.160462623.825399e-01-4.194664e-010.675
Spouse.labor.force.statusWorking parttime-0.344704074.501849e-01-7.656944e-010.444
Number.of.children 0.174759099.073170e-02 1.926109e+000.054
How.many.in.family.earned.money 0.342020971.845892e-01 1.852876e+000.064
Number.in.household.not.related-1.257271835.706125e-01-2.203372e+000.028
Household.members.18.yrs.and.older 1.654711751.250032e+00 1.323735e+000.186
Household.members.13.thru.17.yrs.old 1.651462251.277582e+00 1.292647e+000.196
Household.members.6.thru.12.yrs.old 2.024754141.269402e+00 1.595046e+000.111
Household.members.less.than.6.yrs.old 1.793647311.273639e+00 1.408286e+000.159
Number.of.persons.in.household-2.060425211.269126e+00-1.623499e+000.104
Total.family.incomeb 1000 to 299918.757524887.834843e-08 2.394116e+080.000
Total.family.incomed 4000 to 4999-4.142163492.144385e+00-1.931633e+000.053
Total.family.incomee 5000 to 5999 1.476724052.694633e+00 5.480242e-010.584
Total.family.incomeg 7000 to 799937.399069571.623394e-15 2.303758e+160.000
Total.family.incomeh 8000 to 9999-2.043563551.700908e+00-1.201455e+000.230
Total.family.incomei 10000 to 14999-1.674082968.451365e-01-1.980843e+000.048
Total.family.incomej 15000 to 19999-0.826283619.822848e-01-8.411854e-010.400
Total.family.incomek 20000 to 24999 0.219534456.697839e-01 3.277691e-010.743
1 Very dissatisfied|2 A little dissat-7.141136701.849929e+00-3.860222e+000.000
2 A little dissat|3 Mod satisfied-5.746603911.827069e+00-3.145258e+000.002
3 Mod satisfied|4 Very satisfied-3.352394411.811444e+00-1.850675e+000.064
In [116]:
# full(ish) model: job-related vs. past variables
olrmod_job_rel_past <- polr(Job.or.housework ~ ., data = job_rel_past, Hess = TRUE)#, maxit = 100)

cat("Residual Deviance: ", olrmod_job_rel_past$deviance, "\n")
cat("Degrees of freedom: ", olrmod_job_rel_past$df.residual, "\n")
cat("p value: ", pchisq(olrmod_job_rel_past$deviance, olrmod_job_rel_past$df.residual, lower.tail = FALSE))
# 0/1 problem?

# summary incl p vals
summary_table <- coef(summary(olrmod_job_rel_past))
pval <- pnorm(abs(summary_table[, "t value"]),lower.tail = FALSE)* 2
summary_table <- cbind(summary_table, "p value" = round(pval,3))
summary_table
Warning message:
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
Warning message in polr(Job.or.housework ~ ., data = job_rel_past, Hess = TRUE):
“design appears to be rank-deficient, so dropping some coefs”
Residual Deviance:  819.1162 
Degrees of freedom:  412 
p value:  3.318602e-29
A matrix: 52 × 4 of type dbl
ValueStd. Errort valuep value
How.often.does.r.find.work.stressful2 Hardly ever-1.75675412720.699076573-2.51296380880.012
How.often.does.r.find.work.stressful3 Sometimes-2.17613979350.686862759-3.16823086710.002
How.often.does.r.find.work.stressful4 Often-2.79392544710.703551278-3.97117528280.000
How.often.does.r.find.work.stressful5 Always-3.14201119500.747486513-4.20343529800.000
R.self.emp.or.works.for.somebodySomeone else-0.83151150270.350962207-2.36923374170.018
Labor.force.statusWorking fulltime 0.17147910770.364708467 0.47018131790.638
Respondents.incomeb 1000 to 2999 0.04737749971.372809907 0.03451133290.972
Respondents.incomec 3000 to 3999-1.02402841711.236699584-0.82803328330.408
Respondents.incomed 4000 to 4999-1.79647976911.349746800-1.33097538690.183
Respondents.incomee 5000 to 5999-0.73593303181.246942870-0.59018985520.555
Respondents.incomef 6000 to 6999-1.34369254351.246192584-1.07823827630.281
Respondents.incomeg 7000 to 7999-0.38432469361.567189792-0.24523174880.806
Respondents.incomeh 8000 to 9999-0.48566283681.155596973-0.42027008380.674
Respondents.incomei 10000 to 14999-1.10171108821.037499452-1.06189076650.288
Respondents.incomej 15000 to 19999-0.18185190741.079664311-0.16843374880.866
Respondents.incomek 20000 to 24999-0.68666490401.043252939-0.65819599280.510
Respondents.incomel 25000 or more-0.30241512700.996352905-0.30352210090.761
Number.of.employees..rs.work.siteb 10 to 49 0.09628487100.311025861 0.30957191390.757
Number.of.employees..rs.work.sitec 50to99 0.09008776410.353392576 0.25492262810.799
Number.of.employees..rs.work.sited 100to499-0.46475079970.315316441-1.47391870400.141
Number.of.employees..rs.work.sitee 500to999-0.49502382030.492439401-1.00524819720.315
Number.of.employees..rs.work.sitef 1000to1999-0.06602809020.615457072-0.10728301480.915
Number.of.employees..rs.work.siteg 2000and up 0.50141749730.439469331 1.14096129480.254
Number.of.hours.worked.last.week 0.01193691370.009555525 1.24921587170.212
Was.r.born.in.this.countryYes 0.35080959780.539295542 0.65049600900.515
Living.with.parents.when.16.yrs.oldFather and stpmother 2.33892918541.183192225 1.97679560090.048
Living.with.parents.when.16.yrs.oldFemale relative-0.60573233691.085616692-0.55796151760.577
Living.with.parents.when.16.yrs.oldM and f relatives 0.18807465351.015716053 0.18516459690.853
Living.with.parents.when.16.yrs.oldMother 0.26789388730.736938434 0.36352275170.716
Living.with.parents.when.16.yrs.oldMother and father 0.68079808180.679680125 1.00164482820.317
Living.with.parents.when.16.yrs.oldMother and stpfather 0.92957202190.776660966 1.19688263300.231
Living.with.parents.when.16.yrs.oldOther-0.29060253560.892025066-0.32577844130.745
Geographic.mobility.since.age.16Same city 0.10187335110.282745464 0.36030056700.719
Geographic.mobility.since.age.16Same st dif city-0.26096747830.284589224-0.91699704790.359
Region.of.residence..age.16E sou central 0.41602255960.435714178 0.95480610950.340
Region.of.residence..age.16Foreign-0.20059411050.647273528-0.30990624780.757
Region.of.residence..age.16Middle atlantic 0.95906628070.371707339 2.58016504010.010
Region.of.residence..age.16Mountain 0.00030451690.535146921 0.00056903421.000
Region.of.residence..age.16New england 0.47110355300.564296927 0.83485046750.404
Region.of.residence..age.16Pacific 0.74552345730.398108156 1.87266562280.061
Region.of.residence..age.16South atlantic 0.32315323010.345890625 0.93426420710.350
Region.of.residence..age.16W nor central 0.80403573250.406100058 1.97989563430.048
Region.of.residence..age.16W sou central 0.37221110600.455888015 0.81645293130.414
Type.of.place.lived.in.when.16.yrs.oldBigtocity suburb-0.06319807210.378201145-0.16710174720.867
Type.of.place.lived.in.when.16.yrs.oldCity gt 250000 0.30761749130.386781663 0.79532594480.426
Type.of.place.lived.in.when.16.yrs.oldCountry nonfarm-0.02445256870.380620263-0.06424400130.949
Type.of.place.lived.in.when.16.yrs.oldFarm 0.33517131980.389602812 0.86028978670.390
Type.of.place.lived.in.when.16.yrs.oldTown lt 50000 0.30471990320.306254500 0.99498914660.320
Ever.been.divorced.or.separatedYes 0.39028156340.245629075 1.58890621440.112
1 Very dissatisfied|2 A little dissat-5.20493268751.514967532-3.43567276460.001
2 A little dissat|3 Mod satisfied-3.83705740151.498533270-2.56054201650.010
3 Mod satisfied|4 Very satisfied-1.54272206631.492324813-1.03377096810.301
In [117]:
# full(ish) model: job-related vs. other variables
olrmod_job_rel_other <- polr(Job.or.housework ~ ., data = job_rel_other, Hess = TRUE)

cat("Residual Deviance: ", olrmod_job_rel_other$deviance, "\n")
cat("Degrees of freedom: ", olrmod_job_rel_other$df.residual, "\n") 
cat("p value: ", pchisq(olrmod_job_rel_other$deviance, olrmod_job_rel_other$df.residual, lower.tail = FALSE))
# 0/1 problem?

# summary incl p vals
summary_table <- coef(summary(olrmod_job_rel_other))
pval <- pnorm(abs(summary_table[, "t value"]),lower.tail = FALSE)* 2
summary_table <- cbind(summary_table, "p value" = round(pval,3))
summary_table
Warning message:
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
Warning message in polr(Job.or.housework ~ ., data = job_rel_other, Hess = TRUE):
“design appears to be rank-deficient, so dropping some coefs”
Residual Deviance:  913.6301 
Degrees of freedom:  470 
p value:  8.736357e-31
A matrix: 54 × 4 of type dbl
ValueStd. Errort valuep value
How.often.does.r.find.work.stressful2 Hardly ever-1.179000700.577922516-2.040067080.041
How.often.does.r.find.work.stressful3 Sometimes-1.479854940.561893571-2.633692600.008
How.often.does.r.find.work.stressful4 Often-2.434434470.572784609-4.250174380.000
How.often.does.r.find.work.stressful5 Always-2.268206530.626989916-3.617612460.000
R.self.emp.or.works.for.somebodySomeone else-0.582528900.333973968-1.744234450.081
Labor.force.statusWorking fulltime 0.530983850.342321793 1.551124880.121
Respondents.incomeb 1000 to 2999-0.764282261.157470504-0.660303870.509
Respondents.incomec 3000 to 3999-1.850975461.139235908-1.624751680.104
Respondents.incomed 4000 to 4999-1.409954521.419806045-0.993061360.321
Respondents.incomee 5000 to 5999-1.095440171.188612809-0.921612290.357
Respondents.incomef 6000 to 6999-2.292507631.161923722-1.973027650.048
Respondents.incomeg 7000 to 7999-1.695767371.263233629-1.342402020.179
Respondents.incomeh 8000 to 9999-1.637654271.217582586-1.345004670.179
Respondents.incomei 10000 to 14999-1.075649641.020663028-1.053873430.292
Respondents.incomej 15000 to 19999-1.595566451.039441063-1.535023490.125
Respondents.incomek 20000 to 24999-1.593837281.034284502-1.541004700.123
Respondents.incomel 25000 or more-0.949158721.007750104-0.941859220.346
Number.of.employees..rs.work.siteb 10 to 49-0.294659280.300653837-0.980061610.327
Number.of.employees..rs.work.sitec 50to99 0.320488270.363039365 0.882792060.377
Number.of.employees..rs.work.sited 100to499-0.296451710.313761569-0.944831180.345
Number.of.employees..rs.work.sitee 500to999-0.161924750.478086494-0.338693420.735
Number.of.employees..rs.work.sitef 1000to1999-0.758360660.505159961-1.501228760.133
Number.of.employees..rs.work.siteg 2000and up-0.113097500.407381011-0.277620930.781
Number.of.hours.worked.last.week 0.015224910.008866982 1.717034410.086
Region.of.interviewE sou central 0.313930530.417270476 0.752343020.452
Region.of.interviewMiddle atlantic 0.115381780.342417131 0.336962650.736
Region.of.interviewMountain 0.616499910.416678949 1.479556160.139
Region.of.interviewNew england-0.302603410.554364821-0.545856090.585
Region.of.interviewPacific 0.621810040.334020431 1.861592820.063
Region.of.interviewSouth atlantic 0.471663010.299324331 1.575759000.115
Region.of.interviewW nor central 0.524478930.425705839 1.232021920.218
Region.of.interviewW sou central 0.405847110.376720867 1.077315190.281
Political.party.affiliationInd near rep-0.348674450.375307901-0.929035730.353
Political.party.affiliationIndependent 0.462441570.344265755 1.343269170.179
Political.party.affiliationNot str democrat-0.337796200.333250434-1.013640680.311
Political.party.affiliationNot str republican 0.544151700.331394862 1.642004020.101
Political.party.affiliationOther party-0.814665400.617431678-1.319442180.187
Political.party.affiliationStrong democrat 0.496620860.363943925 1.364553230.172
Political.party.affiliationStrong republican 0.751352550.420291478 1.787693990.074
Condition.of.healthFair-0.465488540.300583591-1.548615950.121
Condition.of.healthGood-0.222492190.224394236-0.991523640.321
Condition.of.healthPoor 0.748062700.918102126 0.814792470.415
Race.of.respondentOther 0.272420080.424560420 0.641652080.521
Race.of.respondentWhite 1.106980360.315100525 3.513102250.000
Respondents.sexMale-0.230125310.201892008-1.139843600.254
Rs.highest.degree2 High school-0.053373610.448377692-0.119037170.905
Rs.highest.degree3 Junior college 0.176398630.611630561 0.288407150.773
Rs.highest.degree4 Bachelor-0.191406890.657966759-0.290906620.771
Rs.highest.degree5 Graduate-0.063698940.802055483-0.079419610.937
Age.of.respondent 0.019297760.007653067 2.521571520.012
Highest.year.of.school.completed 0.088841060.069714614 1.274353500.203
1 Very dissatisfied|2 A little dissat-3.394360991.420185312-2.390083150.017
2 A little dissat|3 Mod satisfied-1.809622491.402007657-1.290736520.197
3 Mod satisfied|4 Very satisfied 0.921364071.402827636 0.656790650.511

Hand-pick a few variables that seem most important just for funsies (yes, we probably shouldn't do this)

In [118]:
# hand-picked variables
jobsat_var = data.frame(subset(GSS_yr, select=c(Job.or.housework, 
                                                Respondents.income, 
                                                Number.of.hours.worked.last.week, #Number.of.hours.usually.work.a.week doen't work 
                                                How.often.does.r.find.work.stressful, 
                                                Age.of.respondent, 
                                                Condition.of.health, 
                                                Rs.highest.degree, 
                                                Respondents.sex)))
head(jobsat_var)
A data.frame: 6 × 8
Job.or.houseworkRespondents.incomeNumber.of.hours.worked.last.weekHow.often.does.r.find.work.stressfulAge.of.respondentCondition.of.healthRs.highest.degreeRespondents.sex
<fct><fct><int><fct><int><fct><fct><fct>
1663 Mod satisfied NA NANA 62Fair1 Lt high schoolFemale
1753 Mod satisfied f 6000 to 6999 404 Often 38Good1 Lt high schoolMale
2463 Mod satisfied NA NANA 49Good1 Lt high schoolMale
2793 Mod satisfied l 25000 or more504 Often 38Good1 Lt high schoolMale
3051 Very dissatisfiedl 25000 or more553 Sometimes49NA 1 Lt high schoolMale
3614 Very satisfied b 1000 to 2999 303 Sometimes29Good1 Lt high schoolFemale
In [119]:
# full model: hand-picked variables
library(MASS)

olrmod_jobsat_var <- polr(Job.or.housework ~ ., data = jobsat_var, Hess = TRUE)

cat("Residual Deviance: ", olrmod_jobsat_var$deviance, "\n")
cat("Degrees of freedom: ", olrmod_jobsat_var$df.residual, "\n") 
cat("p value: ", pchisq(olrmod_jobsat_var$deviance, olrmod_jobsat_var$df.residual, lower.tail = FALSE))

# summary with p values for ordinal logistic regression
summary_table <- coef(summary(olrmod_jobsat_var))
pval <- pnorm(abs(summary_table[, "t value"]),lower.tail = FALSE)* 2
summary_table <- cbind(summary_table, "p value" = round(pval,3))
summary_table
Warning message:
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
Residual Deviance:  1149.107 
Degrees of freedom:  583 
p value:  2.250672e-39
A matrix: 28 × 4 of type dbl
ValueStd. Errort valuep value
Respondents.incomeb 1000 to 2999-0.150861300.950375382-0.15873860.874
Respondents.incomec 3000 to 3999-1.702137560.907239569-1.87617210.061
Respondents.incomed 4000 to 4999-0.905004621.011283580-0.89490690.371
Respondents.incomee 5000 to 5999-0.368450400.978946216-0.37637450.707
Respondents.incomef 6000 to 6999-2.072374620.940595950-2.20325700.028
Respondents.incomeg 7000 to 7999-1.251386571.078126021-1.16070530.246
Respondents.incomeh 8000 to 9999-0.756667831.001165402-0.75578700.450
Respondents.incomei 10000 to 14999-1.173746720.797464096-1.47184900.141
Respondents.incomej 15000 to 19999-0.580306650.802561603-0.72306800.470
Respondents.incomek 20000 to 24999-0.841272130.801286474-1.04990180.294
Respondents.incomel 25000 or more-0.492372840.772832358-0.63710170.524
Number.of.hours.worked.last.week 0.020787430.006596269 3.15139260.002
How.often.does.r.find.work.stressful2 Hardly ever-0.685038100.460628702-1.48718070.137
How.often.does.r.find.work.stressful3 Sometimes-1.084752230.441673101-2.45600700.014
How.often.does.r.find.work.stressful4 Often-1.991503110.455470708-4.37240660.000
How.often.does.r.find.work.stressful5 Always-2.050037580.505074275-4.05888340.000
Age.of.respondent 0.019292590.006285946 3.06916240.002
Condition.of.healthFair-0.601835220.254658761-2.36330070.018
Condition.of.healthGood-0.236681400.198113574-1.19467530.232
Condition.of.healthPoor-0.253753020.740378468-0.34273420.732
Rs.highest.degree2 High school 0.147578380.293437864 0.50292890.615
Rs.highest.degree3 Junior college 0.357156880.413618526 0.86349340.388
Rs.highest.degree4 Bachelor 0.311053090.332140434 0.93651080.349
Rs.highest.degree5 Graduate 0.587098970.365488582 1.60634010.108
Respondents.sexMale-0.094720460.170366569-0.55598030.578
1 Very dissatisfied|2 A little dissat-4.033913130.962412424-4.19145990.000
2 A little dissat|3 Mod satisfied-2.597484620.941505452-2.75886310.006
3 Mod satisfied|4 Very satisfied-0.169912070.936261771-0.18147920.856
In [120]:
# individual models: hand-picked variables

solrmod = function (data, var) {
    print(var)
    var = data[[var]]
    
    olrmod_onevar <- polr(Job.or.housework ~ var, data = jobsat_var, Hess = TRUE)

    # summary with p values for ordinal logistic regression
    summary_table <- coef(summary(olrmod_onevar))
    pval <- pnorm(abs(summary_table[, "t value"]),lower.tail = FALSE)* 2
    summary_table <- cbind(summary_table, "p value" = round(pval,3))
    print(summary_table)

    cat("Residual Deviance: ", olrmod_onevar$deviance, "\n")
    cat("Degrees of freedom: ", olrmod_onevar$df.residual, "\n") 
    cat("p value: ", pchisq(olrmod_onevar$deviance, olrmod_onevar$df.residual, lower.tail = FALSE), "\n\n")
}

for (i in 2:ncol(jobsat_var)) {
    solrmod(jobsat_var, colnames(jobsat_var)[i])
}
[1] "Respondents.income"
                                           Value Std. Error    t value p value
varb 1000 to 2999                     -1.3342906  0.5245779 -2.5435508   0.011
varc 3000 to 3999                     -1.1086170  0.5227988 -2.1205423   0.034
vard 4000 to 4999                     -1.0892066  0.5954339 -1.8292652   0.067
vare 5000 to 5999                     -0.8854388  0.5943444 -1.4897740   0.136
varf 6000 to 6999                     -2.0205324  0.5798532 -3.4845588   0.000
varg 7000 to 7999                     -1.3408573  0.6322455 -2.1207859   0.034
varh 8000 to 9999                     -1.0068281  0.5297636 -1.9005232   0.057
vari 10000 to 14999                   -1.1397779  0.4516927 -2.5233483   0.012
varj 15000 to 19999                   -0.7999584  0.4667857 -1.7137596   0.087
vark 20000 to 24999                   -0.7687473  0.4512671 -1.7035305   0.088
varl 25000 or more                    -0.3930505  0.4167804 -0.9430638   0.346
1 Very dissatisfied|2 A little dissat -3.9768982  0.4383543 -9.0723381   0.000
2 A little dissat|3 Mod  satisfied    -2.5589607  0.4171972 -6.1336958   0.000
3 Mod  satisfied|4 Very satisfied     -0.5716925  0.4097723 -1.3951467   0.163
Residual Deviance:  2320.25 
Degrees of freedom:  1094 
p value:  3.229996e-90 

[1] "Number.of.hours.worked.last.week"
                                           Value  Std. Error    t value p value
var                                    0.0130785 0.004034757   3.241459   0.001
1 Very dissatisfied|2 A little dissat -2.8050179 0.229029030 -12.247434   0.000
2 A little dissat|3 Mod  satisfied    -1.4521606 0.185770136  -7.816975   0.000
3 Mod  satisfied|4 Very satisfied      0.5631219 0.177062690   3.180353   0.001
Residual Deviance:  2351.941 
Degrees of freedom:  1128 
p value:  2.48408e-88 

[1] "How.often.does.r.find.work.stressful"
                                           Value Std. Error    t value p value
var2 Hardly ever                      -0.4493918  0.2769072  -1.622897   0.105
var3 Sometimes                        -0.7731030  0.2582715  -2.993374   0.003
var4 Often                            -1.2988446  0.2677972  -4.850106   0.000
var5 Always                           -1.6728807  0.3104757  -5.388122   0.000
1 Very dissatisfied|2 A little dissat -4.2627680  0.2894000 -14.729673   0.000
2 A little dissat|3 Mod  satisfied    -2.8819820  0.2565074 -11.235473   0.000
3 Mod  satisfied|4 Very satisfied     -0.8181188  0.2427690  -3.369948   0.001
Residual Deviance:  2379.405 
Degrees of freedom:  1153 
p value:  1.867453e-87 

[1] "Age.of.respondent"
                                            Value  Std. Error    t value
var                                    0.02546448 0.003388253   7.515520
1 Very dissatisfied|2 A little dissat -2.11522368 0.189958449 -11.135191
2 A little dissat|3 Mod  satisfied    -0.72798913 0.157826613  -4.612588
3 Mod  satisfied|4 Very satisfied      1.17249174 0.157749307   7.432627
                                      p value
var                                         0
1 Very dissatisfied|2 A little dissat       0
2 A little dissat|3 Mod  satisfied          0
3 Mod  satisfied|4 Very satisfied           0
Residual Deviance:  3233.582 
Degrees of freedom:  1522 
p value:  3.124699e-125 

[1] "Condition.of.health"
                                           Value Std. Error    t value p value
varFair                               -0.9311870  0.1818538  -5.120524   0.000
varGood                               -0.3920323  0.1482425  -2.644533   0.008
varPoor                               -0.9096892  0.3545814  -2.565530   0.010
1 Very dissatisfied|2 A little dissat -3.5603538  0.1934818 -18.401490   0.000
2 A little dissat|3 Mod  satisfied    -2.3349229  0.1452920 -16.070557   0.000
3 Mod  satisfied|4 Very satisfied     -0.3274201  0.1205034  -2.717102   0.007
Residual Deviance:  2053.904 
Degrees of freedom:  968 
p value:  3.396392e-80 

[1] "Rs.highest.degree"
                                           Value Std. Error     t value p value
var2 High school                       0.1114603  0.1486238   0.7499492   0.453
var3 Junior college                    0.1293329  0.2211834   0.5847315   0.559
var4 Bachelor                          0.3480732  0.1716671   2.0276063   0.043
var5 Graduate                          0.7757432  0.2025907   3.8291153   0.000
1 Very dissatisfied|2 A little dissat -2.9653296  0.1765861 -16.7925400   0.000
2 A little dissat|3 Mod  satisfied    -1.5953142  0.1408796 -11.3239563   0.000
3 Mod  satisfied|4 Very satisfied      0.2663325  0.1329300   2.0035549   0.045
Residual Deviance:  3280.227 
Degrees of freedom:  1521 
p value:  8.327414e-131 

[1] "Respondents.sex"
                                            Value Std. Error     t value
varMale                               -0.07736343 0.09772970  -0.7916062
1 Very dissatisfied|2 A little dissat -3.19735264 0.13679050 -23.3740838
2 A little dissat|3 Mod  satisfied    -1.83347742 0.08565987 -21.4041577
3 Mod  satisfied|4 Very satisfied      0.01274883 0.06717897   0.1897741
                                      p value
varMale                                 0.429
1 Very dissatisfied|2 A little dissat   0.000
2 A little dissat|3 Mod  satisfied      0.000
3 Mod  satisfied|4 Very satisfied       0.849
Residual Deviance:  3300.084 
Degrees of freedom:  1524 
p value:  1.265387e-132 

In [ ]: